# Imports assumed by the training loops in this section; Actor, Critic, plot,
# Transition, ReplayMemory, SumTreeMemoryBuffer and the is_ipython flag are
# defined elsewhere in the project.
from itertools import count

import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython import display


def start(GAME_NAME, MAX_EPISODE):
    env = gym.make(GAME_NAME)                                  # create environment
    actor = Actor(env.observation_space, env.action_space)     # create actor
    critic = Critic(env.observation_space, env.action_space)   # create critic

    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []

    RENDER = False
    MAX_EP_STEPS = 1000
    #DISPLAY_REWARD_THRESHOLD = 200

    #print("begin.\n\n")
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        critic.reset()
        actor.reset()
        track_r = []

        for t in count():
            if RENDER:
                env.render()

            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            #if done: r = -20  # Penalty if the agent dies
            track_r.append(r)

            td_error, abs_error = critic.learn(s, r, s_)  # Critic learns from the TD error
            actor.learn(s, a, td_error)                   # Actor learns using the TD error as advantage

            s = s_
            #print("... in episode (%d) step (%d)" % (i_episode + 1, t))

            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
                #env.render()

            if done or t >= MAX_EP_STEPS:
                # Episode finished, record results
                ep_rs_sum = sum(track_r)
                #if 'running_reward' not in globals():
                #    running_reward = ep_rs_sum
                #else:
                #    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
                running_reward_avg = ep_rs_sum / float(t)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                #print("episode:", i_episode, " reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)
                break

    return reward_per_epi, durations_per_epi, l_A, l_C
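
# A hypothetical call to start(); the environment id and episode count below are
# illustrative only and must match whatever the Actor/Critic constructors expect.
rewards, durations, actor_losses, critic_losses = start('CartPole-v0', MAX_EPISODE=500)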

def run():
    # build the environment using OpenAI Gym
    # (n_actions, n_features, lr_actor, lr_critic and n_episodes are module-level settings)
    env = gym.make('MountainCar-v0')
    env = env.unwrapped
    sess = tf.Session()

    # create an actor and a critic
    actor = Actor(sess, n_actions=n_actions, n_features=n_features, lr=lr_actor)
    critic = Critic(sess, n_features=n_features, lr=lr_critic)

    # build the two networks
    actor.build_net()
    critic.build_net()
    sess.run(tf.global_variables_initializer())
    # tf.summary.FileWriter("", sess.graph)

    # count steps
    step = 0
    # env.render()
    for episode in range(n_episodes):
        s = env.reset()
        # comment out render() to speed up training
        # env.render()

        # s returned by gym is a vector; add a batch dimension to make it a 1 x n_features matrix
        s = s[np.newaxis, :]
        a = actor.choose_action(s)

        while True:
            step += 1
            # take one step and get a new transition
            s_, r, done, info = env.step(a)
            # add the batch dimension to s_ as well
            s_ = s_[np.newaxis, :]
            a_ = actor.choose_action(s_)

            # the critic computes the TD error, which the actor uses to learn
            td_error = critic.learn(s, s_)
            actor.learn(s, a, td_error)

            s = s_
            a = a_  # act with the freshly chosen action on the next step

            if step % 500 == 0:
                print(step, s_)
            if done:
                print('arrive')
                print(s_)
                break

class NetworkAC(object):
    """Actor-critic wrapper holding one TensorFlow session for both networks."""

    def __init__(self):
        tf.reset_default_graph()
        self.sess = tf.Session()
        self.actor = Actor(
            self.sess,
            n_features=Config.PLAYER_DIMENSION * (Config.DEFENDER_COUNT + Config.INTRUDER_COUNT),
            lr=Config.LEARNING_RATE_START,
            action_bound=[-math.pi, math.pi])
        self.critic = Critic(
            self.sess,
            n_features=Config.PLAYER_DIMENSION * (Config.DEFENDER_COUNT + Config.INTRUDER_COUNT),
            lr=Config.LEARNING_RATE_START)
        self.sess.run(tf.global_variables_initializer())

    def train(self, x, a, y, r):
        td_error = self.critic.learn(x, r, y)  # gradient = grad[r + gamma * V(y) - V(x)]
        self.actor.learn(x, a, td_error)       # true_gradient = grad[log Pi(s, a) * td_error]

    def predict(self, state):
        action = self.actor.choose_action(state)
        value = self.critic.predict(state)
        return action, value
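
# Hypothetical usage sketch of NetworkAC. Only the NetworkAC calls mirror the
# class above; the state, reward and next_state values are placeholders, and the
# state shape is a guess that must match what Actor.choose_action expects.
net = NetworkAC()
state = np.zeros(Config.PLAYER_DIMENSION * (Config.DEFENDER_COUNT + Config.INTRUDER_COUNT))

action, value = net.predict(state)            # query the current policy and value estimate
next_state = state                            # placeholder for the environment transition
reward = 0.0                                  # placeholder reward
net.train(state, action, next_state, reward)  # one actor-critic update on this transition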

def start_p(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000):
    env = gym.make(GAME_NAME)
    actor = Actor(env.observation_space, env.action_space)
    critic = Critic(env.observation_space, env.action_space)

    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []

    MAX_EPISODE = 200
    RENDER = False
    MAX_EP_STEPS = 1000
    DISPLAY_REWARD_THRESHOLD = 200

    replay_memory = SumTreeMemoryBuffer(MEMORY_CAPACITY)

    #print("begin.\n\n")
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        track_r = []
        critic._v_ = []     # clear critic loss buffer
        actor._loss_ = []   # clear actor loss buffer

        for t in count():
            if RENDER:
                env.render()

            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            #if done: r = -20  # Penalty if the agent dies
            track_r.append(r)

            # ACER: actor-critic with (prioritised) experience replay
            if not done:
                transition = np.hstack((s, a, r, s_))
                replay_memory.save(transition)  # save non-final transition
                #print(len(replay_memory))
                #print(replay_memory.data)
                #print(replay_memory.gettree)

            if len(replay_memory) >= BATCH_SIZE:  # enough transitions stored for one batch
                tree_idx, batch, ISWeights = replay_memory.sample(BATCH_SIZE)  # sample from memory
                # the column slicing below assumes an 8-dimensional observation (e.g. LunarLander-v2)
                s_b = np.asarray(batch[-1, 0:8])      # state
                s_b_n = np.asarray(batch[-1, 10:18])  # next state
                a_b = np.asarray(batch[-1, 8])        # action
                r_b = np.asarray(batch[-1, 9])        # reward
                #print("tree_idx: " + str(tree_idx))
                #print(ISWeights)

                td_error, abs_error = critic.learn(s_b, r_b, s_b_n, ISWeights)  # Critic learns
                replay_memory.batch_update(tree_idx, abs_error)                 # update tree priorities
                actor.learn(s_b, a_b, td_error)                                 # Actor learns
                #print("td_error: " + str(td_error))
                #print("abs_error: " + str(abs_error))

            s = s_
            #print("... in episode (%d) step (%d)" % (i_episode + 1, t))

            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
                #env.render()

            if done or t >= MAX_EP_STEPS:
                # Episode finished, record results
                ep_rs_sum = sum(track_r)
                #if 'running_reward' not in globals():
                #    running_reward = ep_rs_sum
                #else:
                #    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
                running_reward_avg = ep_rs_sum / float(t)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(running_reward_avg)  # record average reward per step here
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                #print("episode:", i_episode, " reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)
                break

    return reward_per_epi, durations_per_epi, l_A, l_C
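
# A simplified, flat-array sketch of the prioritised-replay interface used by
# start_p (save / sample / batch_update / __len__). The project's real
# SumTreeMemoryBuffer presumably uses a sum tree for O(log N) proportional
# sampling; the alpha, beta and eps values here are assumptions.
import random


class SumTreeMemoryBufferSketch(object):
    def __init__(self, capacity, alpha=0.6, beta=0.4, eps=1e-6):
        self.capacity = capacity
        self.alpha, self.beta, self.eps = alpha, beta, eps
        self.data = []         # stored transitions (flattened np.hstack rows)
        self.priorities = []   # one priority per stored transition
        self.position = 0

    def save(self, transition):
        # New transitions get the current maximum priority so they are replayed at least once
        max_p = max(self.priorities) if self.priorities else 1.0
        if len(self.data) < self.capacity:
            self.data.append(transition)
            self.priorities.append(max_p)
        else:
            self.data[self.position] = transition
            self.priorities[self.position] = max_p
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # Sample indices proportionally to priority^alpha
        probs = np.asarray(self.priorities) ** self.alpha
        probs /= probs.sum()
        idx = np.random.choice(len(self.data), batch_size, p=probs)
        batch = np.vstack([self.data[i] for i in idx])
        # Importance-sampling weights correct for the non-uniform sampling
        weights = (len(self.data) * probs[idx]) ** (-self.beta)
        weights /= weights.max()
        return idx, batch, weights.reshape(-1, 1)

    def batch_update(self, idx, abs_errors):
        # Priorities track the absolute TD errors of the sampled transitions
        for i, err in zip(np.atleast_1d(idx), np.atleast_1d(abs_errors).flatten()):
            self.priorities[int(i)] = float(err) + self.eps

    def __len__(self):
        return len(self.data)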

def start_er(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000):
    #print("make environment")
    env = gym.make(GAME_NAME)
    #print("create actor, critic")
    actor = Actor(env.observation_space, env.action_space)
    critic = Critic(env.observation_space, env.action_space)

    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []

    MAX_EPISODE = 500
    RENDER = False
    MAX_EP_STEPS = 1000
    DISPLAY_REWARD_THRESHOLD = 200

    replay_memory = ReplayMemory(MEMORY_CAPACITY)

    #print("begin.\n")
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        track_r = []
        critic._v_ = []     # clear critic loss buffer
        actor._loss_ = []   # clear actor loss buffer

        for t in count():
            if RENDER:
                env.render()

            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            #if done: r = -20  # Penalty if the agent dies
            track_r.append(r)

            # ACER: learn from experience replay
            if not done:
                replay_memory.save(s, a, r, s_)  # save non-final transition into memory

            if len(replay_memory) >= BATCH_SIZE:
                transitions = replay_memory.sample(BATCH_SIZE)  # sample a batch from memory for training
                batch = Transition(*zip(*transitions))
                s_b = np.asarray(batch.state)
                s_b_n = np.asarray(batch.next_state)
                a_b = np.asarray(batch.action).reshape(BATCH_SIZE, 1)
                r_b = np.asarray(batch.reward).reshape(BATCH_SIZE, 1)

                td_error, abs_error = critic.learn(s_b, r_b, s_b_n)  # Critic learns
                actor.learn(s_b, a_b, td_error)                      # Actor learns

            s = s_
            #print("... in episode (%d) step (%d)" % (i_episode + 1, t))

            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
                #env.render()

            if done or t >= MAX_EP_STEPS:
                # Episode finished, record results
                ep_rs_sum = sum(track_r)
                #if 'running_reward' not in globals():
                #    running_reward = ep_rs_sum
                #else:
                #    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
                running_reward_avg = ep_rs_sum / float(t)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                #print("episode:", i_episode, " reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)
                break

    return reward_per_epi, durations_per_epi, l_A, l_C

def start_er(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000):
    #print("make environment")
    env = gym.make(GAME_NAME)
    #print("create actor, critic")
    actor = Actor(env.observation_space, env.action_space)
    critic = Critic(env.observation_space, env.action_space)

    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []

    MAX_EPISODE = 200
    RENDER = False
    MAX_EP_STEPS = 1000
    DISPLAY_REWARD_THRESHOLD = 200

    # Two buckets: positive-reward transitions and all other transitions
    replay_memory_1 = ReplayMemory(MEMORY_CAPACITY)
    replay_memory_2 = ReplayMemory(MEMORY_CAPACITY)
    f_1 = BATCH_SIZE // 2  # fraction of the batch drawn from each bucket
    f_2 = BATCH_SIZE // 2

    #print("begin.\n")
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        track_r = []
        critic._v_ = []     # clear critic loss buffer
        actor._loss_ = []   # clear actor loss buffer

        for t in count():
            if RENDER:
                env.render()

            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            track_r.append(r)

            # Save non-final transitions into memory, routed by reward sign
            if not done:
                if r > 0:
                    replay_memory_1.save(s, a, r, s_)
                else:
                    replay_memory_2.save(s, a, r, s_)

            # Learn from memory once both buckets can fill their share of the batch
            if len(replay_memory_1) >= f_1 and len(replay_memory_2) >= f_2:
                transitions_1 = replay_memory_1.sample(f_1)  # sample from both buckets
                batch1 = Transition(*zip(*transitions_1))
                transitions_2 = replay_memory_2.sample(f_2)
                batch2 = Transition(*zip(*transitions_2))

                s_b = np.append(np.asarray(batch1.state),
                                np.asarray(batch2.state), axis=0)
                s_b_n = np.append(np.asarray(batch1.next_state),
                                  np.asarray(batch2.next_state), axis=0)
                a_b = np.append(np.asarray(batch1.action).reshape(f_1, 1),
                                np.asarray(batch2.action).reshape(f_2, 1), axis=0)
                r_b = np.append(np.asarray(batch1.reward).reshape(f_1, 1),
                                np.asarray(batch2.reward).reshape(f_2, 1), axis=0)

                td_error, abs_error = critic.learn(s_b, r_b, s_b_n)  # Critic learns
                actor.learn(s_b, a_b, td_error)                      # Actor learns

            s = s_
            #print("... in episode (%d) step (%d)" % (i_episode + 1, t))

            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
                #env.render()

            if done or t >= MAX_EP_STEPS:
                # Episode finished, record results
                ep_rs_sum = sum(track_r)
                #if 'running_reward' not in globals():
                #    running_reward = ep_rs_sum
                #else:
                #    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True  # rendering
                running_reward_avg = ep_rs_sum / float(t)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                #print("episode:", i_episode, " reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)
                break

    return reward_per_epi, durations_per_epi, l_A, l_C
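
# The two replay variants above use a Transition namedtuple and a ReplayMemory
# class that are not shown in this section. A minimal sketch consistent with the
# calls save(s, a, r, s_), sample(BATCH_SIZE) and len(replay_memory) could look
# like the following; the ring-buffer eviction policy is an assumption.
import random
from collections import namedtuple

# Field names match the attribute accesses batch.state, batch.action, etc.
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state'))


class ReplayMemorySketch(object):
    """Fixed-size ring buffer of transitions (sketch)."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def save(self, state, action, reward, next_state):
        # Overwrite the oldest transition once the buffer is full
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(state, action, reward, next_state)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # Uniform random sampling without replacement
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)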

track_r = []
while True:
    if RENDER:
        env.render()

    a = actor.choose_action(s)
    s_, r, done, info = env.step(a)
    if done:
        r = -20  # penalty on termination

    track_r.append(r)

    td_error = critic.learn(s, r, s_)
    actor.learn(s, a, td_error)

    s = s_
    t += 1

    if done or t >= MAX_EP_STEPS:
        ep_rs_sum = sum(track_r)
        if 'running_reward' not in globals():
            running_reward = ep_rs_sum
        else:
            running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
        if running_reward > DISPLAY_REWARD_THRESHOLD:
            RENDER = True  # rendering
        print("episode:", epoch, " reward:", int(ep_rs_sum))
        break

s = env.reset()
t = 0
track_r = []
while True:
    if RENDER:
        env.render()

    a = actor.choose_action(s)
    s_, r, done, info = env.step(a)
    if done:
        r = -20  # penalty on termination

    track_r.append(r)

    td_error = critic.learn(s, r, s_)
    actor.learn(s, a, td_error)

    s = s_
    t += 1

    if done or t >= MAX_EP_STEPS:
        ep_rs_sum = sum(track_r)
        if 'running_reward' not in globals():
            running_reward = ep_rs_sum
        else:
            running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
        if running_reward > DISPLAY_REWARD_THRESHOLD:
            RENDER = True  # rendering
        print("episode:", i_episode, " reward:", int(running_reward))
        break

class DDPG(object):
    def __init__(self, a_dim, s_dim):
        self.sess = tf.Session()
        self.a_dim, self.s_dim = a_dim, s_dim
        self.LR_A = 0.001
        self.LR_C = 0.001
        self.CAPACITY = 10000
        self.BATCH_SIZE = 32
        self.BATCH_SIZE_g = 24
        self.SETTING = {
            'GAMMA': 0.9,
            'TAU': 0.01,
            'N_D_MAX': 1 / np.sqrt(self.s_dim),
            'N_D_MIN': -1 / np.sqrt(self.s_dim),
            'F_N_D_MAX': 3e-3,
            'F_N_D_MIN': -3e-3,
            'L2_DECAY': 0.01,
        }

        self.S = tf.placeholder(tf.float32, shape=[None, self.s_dim], name='State')
        self.S_ = tf.placeholder(tf.float32, shape=[None, self.s_dim], name='State_')
        self.R = tf.placeholder(tf.float32, shape=[None, 1], name='Reward')

        self.actor = Actor(self.sess, self.a_dim, self.s_dim, self.LR_A,
                           self.SETTING, self.S, self.S_)
        self.critic = Critic(self.sess, self.a_dim, self.s_dim, self.LR_C,
                             self.SETTING, self.S, self.S_, self.R,
                             self.actor.action, self.actor.action_)
        self.actor.add_grad_to_graph(self.critic.a_grads)

        self.memory = Memory(self.CAPACITY, s_dim * 2 + a_dim + 1, self.BATCH_SIZE)
        # self.memory_g = Memory(self.CAPACITY, s_dim * 2 + a_dim + 1, self.BATCH_SIZE_g)

        self.sess.run(tf.global_variables_initializer())
        tf.summary.FileWriter('logs/', self.sess.graph)

    def store_transition(self, state, action, reward, state_):
        self.memory.store_transition(state, action, reward, state_)

    # def store_transition_g(self, state, action, reward, state_):
    #     self.memory_g.store_transition(state, action, reward, state_)

    def learn(self):
        # Start learning only once the replay memory has been filled
        if self.memory.pointer > self.memory.capacity:
            bt = self.memory.sample()
            bs = bt[:, :self.s_dim]
            ba = bt[:, self.s_dim:self.s_dim + self.a_dim]
            br = bt[:, -self.s_dim - 1:-self.s_dim]
            bs_ = bt[:, -self.s_dim:]
            self.critic.learn(bs, ba, br, bs_)
            self.actor.learn(bs)

            # bt = self.memory_g.sample()
            # bs = bt[:, :self.s_dim]
            # ba = bt[:, self.s_dim:self.s_dim + self.a_dim]
            # br = bt[:, -self.s_dim - 1:-self.s_dim]
            # bs_ = bt[:, -self.s_dim:]
            # self.critic.learn(bs, ba, br, bs_)
            # self.actor.learn(bs)

    def save(self):
        saver = tf.train.Saver()
        saver.save(self.sess, './params/params', write_meta_graph=False)

    def load(self):
        saver = tf.train.Saver()
        saver.restore(self.sess, './params/params')
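
# The DDPG class above relies on a Memory buffer that is not shown here. A
# minimal sketch consistent with the calls store_transition(s, a, r, s_),
# sample(), and the pointer/capacity attributes used in DDPG.learn() could look
# like this; the uniform sampling and ring-buffer overwrite are assumptions.
class MemorySketch(object):
    def __init__(self, capacity, dims, batch_size):
        self.capacity = capacity
        self.batch_size = batch_size
        self.data = np.zeros((capacity, dims), dtype=np.float32)
        self.pointer = 0  # total number of transitions stored so far

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % self.capacity  # overwrite the oldest entry when full
        self.data[index, :] = transition
        self.pointer += 1

    def sample(self):
        # DDPG.learn() only samples once pointer > capacity, so the whole array is valid
        indices = np.random.choice(self.capacity, size=self.batch_size)
        return self.data[indices, :]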