def start(GAME_NAME, MAX_EPISODE):
    """Train a basic (no-replay) actor-critic agent on a gym environment.

    Args:
        GAME_NAME: gym environment id passed to ``gym.make``.
        MAX_EPISODE: number of training episodes to run.

    Returns:
        Tuple ``(reward_per_epi, durations_per_epi, l_A, l_C)``: per-episode
        total reward, episode length (steps), mean actor loss, and mean
        critic loss.
    """
    env = gym.make(GAME_NAME)  # create environment
    actor = Actor(env.observation_space, env.action_space)    # create actor
    critic = Critic(env.observation_space, env.action_space)  # create critic

    reward_per_epi = []
    durations_per_epi = []
    l_A = []  # mean actor loss per episode
    l_C = []  # mean critic loss per episode

    RENDER = False
    MAX_EP_STEPS = 1000  # hard cap on steps per episode

    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        critic.reset()
        actor.reset()
        track_r = []  # rewards collected this episode
        for t in count():
            if RENDER:
                env.render()
            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            track_r.append(r)
            # Critic estimates the TD error; actor follows it.
            td_error, abs_error = critic.learn(s, r, s_)
            actor.learn(s, a, td_error)
            s = s_
            if is_ipython:
                # Refresh inline matplotlib output in notebooks.
                display.clear_output(wait=True)
                display.display(plt.gcf())
            if done or t >= MAX_EP_STEPS:
                # Episode finished: record episode statistics.
                # (Removed dead `running_reward_avg = ep_rs_sum / float(t)`:
                # it was unused and divided by zero when done at t == 0.)
                ep_rs_sum = sum(track_r)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                break
    return reward_per_epi, durations_per_epi, l_A, l_C
def run():
    """Run TD actor-critic training on MountainCar-v0.

    Relies on module-level hyperparameters: ``n_actions``, ``n_features``,
    ``lr_actor``, ``lr_critic``, ``n_episodes``.
    """
    # Build the environment using OpenAI gym (unwrapped removes step limits).
    env = gym.make('MountainCar-v0')
    env = env.unwrapped
    sess = tf.Session()
    # Create an actor and a critic sharing one TF session.
    actor = Actor(sess, n_actions=n_actions, n_features=n_features, lr=lr_actor)
    critic = Critic(sess, n_features=n_features, lr=lr_critic)
    # Build the two networks.
    actor.build_net()
    critic.build_net()
    sess.run(tf.global_variables_initializer())
    step = 0  # global step counter across episodes
    for episode in range(n_episodes):
        s = env.reset()
        # s returned by gym is a vector; add a batch dimension (matrix).
        s = s[np.newaxis, :]
        a = actor.choose_action(s)
        while True:
            step += 1
            # Take a new transition.
            s_, r, done, info = env.step(a)
            s_ = s_[np.newaxis, :]  # add a batch dimension to s_ as well
            a_ = actor.choose_action(s_)
            # Critic computes td_error; actor learns from it.
            td_error = critic.learn(s, s_)
            actor.learn(s, a, td_error)
            s = s_
            # BUG FIX: advance the action. The original never reassigned `a`,
            # so env.step replayed the first action for the whole episode.
            a = a_
            if step % 500 == 0:
                print(step, s_)
            if done:
                print('arrive')
                print(s_)
                break
class NetworkAC(object):
    """Actor-critic pair sharing one TensorFlow session.

    Wraps an Actor and a Critic built on a fresh default graph, exposing
    a combined train/predict interface.
    """

    def __init__(self):
        tf.reset_default_graph()
        self.sess = tf.Session()
        # Feature vector covers every player's state dimensions.
        feature_count = Config.PLAYER_DIMENSION * (
            Config.DEFENDER_COUNT + Config.INTRUDER_COUNT)
        self.actor = Actor(
            self.sess,
            n_features=feature_count,
            lr=Config.LEARNING_RATE_START,
            action_bound=[-math.pi, math.pi],
        )
        self.critic = Critic(
            self.sess,
            n_features=feature_count,
            lr=Config.LEARNING_RATE_START,
        )
        self.sess.run(tf.global_variables_initializer())

    def train(self, x, a, y, r):
        """One update step: critic yields the TD error, actor follows it."""
        td_error = self.critic.learn(x, r, y)  # gradient = grad[r + gamma * V(y_) - V(x_)]
        self.actor.learn(x, a, td_error)       # true_gradient = grad[logPi(s,a) * td_error]

    def predict(self, state):
        """Return the (action, value) pair for *state*."""
        action = self.actor.choose_action(state)
        return action, self.critic.predict(state)
def start_p(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000):
    """Actor-critic with prioritized experience replay (SumTree memory).

    Args:
        GAME_NAME: gym environment id passed to ``gym.make``.
        BATCH_SIZE: number of transitions sampled from the replay memory.
        MEMORY_CAPACITY: capacity of the SumTree replay buffer.

    Returns:
        Tuple ``(reward_per_epi, durations_per_epi, l_A, l_C)``: per-episode
        total reward, average reward per step, mean actor loss, and mean
        critic loss.
    """
    env = gym.make(GAME_NAME)
    actor = Actor(env.observation_space, env.action_space)
    critic = Critic(env.observation_space, env.action_space)
    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []
    MAX_EPISODE = 200
    RENDER = False
    MAX_EP_STEPS = 1000
    replay_memory = SumTreeMemoryBuffer(MEMORY_CAPACITY)
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        track_r = []
        critic._v_ = []    # clean critic loss buffer
        actor._loss_ = []  # clean actor loss buffer
        for t in count():
            if RENDER:
                env.render()
            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            track_r.append(r)
            # ACER: actor-critic with (prioritized) experience replay.
            if not done:
                transition = np.hstack((s, a, r, s_))
                replay_memory.save(transition)  # save non-final transition
            if len(replay_memory) >= BATCH_SIZE:
                # Sample a prioritized batch with importance-sampling weights.
                tree_idx, batch, ISWeights = replay_memory.sample(BATCH_SIZE)
                # NOTE(review): indices assume an 8-dim observation
                # (state=0:8, action=8, reward=9, next state=10:18) and only
                # the LAST sampled row is used despite sampling BATCH_SIZE
                # rows -- confirm this is intended.
                s_b = np.asarray(batch[-1, 0:8])     # state
                s_b_n = np.asarray(batch[-1, 10:18]) # next state
                a_b = np.asarray(batch[-1, 8])       # action
                r_b = np.asarray(batch[-1, 9])       # reward
                td_error, abs_error = critic.learn(s_b, r_b, s_b_n, ISWeights)
                replay_memory.batch_update(tree_idx, abs_error)  # update tree priority
                actor.learn(s_b, a_b, td_error)
                print("abs_error: " + str(abs_error))
            s = s_
            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
            if done or t >= MAX_EP_STEPS:
                ep_rs_sum = sum(track_r)
                # BUG FIX: guard t == 0 (episode ending on the first step)
                # to avoid ZeroDivisionError.
                running_reward_avg = ep_rs_sum / float(max(t, 1))
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(running_reward_avg)  # draw average reward here
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                break
    return reward_per_epi, durations_per_epi, l_A, l_C
def start_er(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000):
    """Actor-critic with uniform experience replay.

    Args:
        GAME_NAME: gym environment id passed to ``gym.make``.
        BATCH_SIZE: training batch size sampled from the replay memory.
        MEMORY_CAPACITY: capacity of the replay memory.

    Returns:
        Tuple ``(reward_per_epi, durations_per_epi, l_A, l_C)``: per-episode
        total reward, episode length (steps), mean actor loss, and mean
        critic loss.
    """
    env = gym.make(GAME_NAME)
    actor = Actor(env.observation_space, env.action_space)
    critic = Critic(env.observation_space, env.action_space)
    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []
    MAX_EPISODE = 500
    RENDER = False
    MAX_EP_STEPS = 1000
    replay_memory = ReplayMemory(MEMORY_CAPACITY)
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        track_r = []
        critic._v_ = []    # clean critic loss buffer
        actor._loss_ = []  # clean actor loss buffer
        for t in count():
            if RENDER:
                env.render()
            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            track_r.append(r)
            # ACER: learn from replayed experience.
            if not done:
                replay_memory.save(s, a, r, s_)  # save non-final transition
            if len(replay_memory) >= BATCH_SIZE:
                # Sample a uniform batch and train both networks on it.
                transitions = replay_memory.sample(BATCH_SIZE)
                batch = Transition(*zip(*transitions))
                s_b = np.asarray(batch.state)
                s_b_n = np.asarray(batch.next_state)
                a_b = np.asarray(batch.action).reshape(BATCH_SIZE, 1)
                r_b = np.asarray(batch.reward).reshape(BATCH_SIZE, 1)
                td_error, abs_error = critic.learn(s_b, r_b, s_b_n)  # Critic learn
                actor.learn(s_b, a_b, td_error)                      # Actor learn
            s = s_
            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
            if done or t >= MAX_EP_STEPS:
                # Episode finished: record episode statistics.
                # (Removed dead `running_reward_avg = ep_rs_sum / float(t)`:
                # it was unused and divided by zero when done at t == 0.)
                ep_rs_sum = sum(track_r)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                break
    return reward_per_epi, durations_per_epi, l_A, l_C
def start_er(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000):
    """Actor-critic with two-bucket (stratified) experience replay.

    Transitions are split into two replay buckets by reward sign
    (``r > 0`` vs ``r <= 0``); each training batch draws half from each.

    NOTE(review): this shadows the earlier ``start_er`` definition in this
    file -- consider renaming one of them.

    Args:
        GAME_NAME: gym environment id passed to ``gym.make``.
        BATCH_SIZE: training batch size, split evenly between the buckets.
        MEMORY_CAPACITY: capacity of each replay bucket.

    Returns:
        Tuple ``(reward_per_epi, durations_per_epi, l_A, l_C)``.
    """
    env = gym.make(GAME_NAME)
    actor = Actor(env.observation_space, env.action_space)
    critic = Critic(env.observation_space, env.action_space)
    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []
    MAX_EPISODE = 200
    RENDER = False
    MAX_EP_STEPS = 1000
    replay_memory_1 = ReplayMemory(MEMORY_CAPACITY)  # transitions with r > 0
    replay_memory_2 = ReplayMemory(MEMORY_CAPACITY)  # transitions with r <= 0
    # BUG FIX: use integer division. On Python 3, BATCH_SIZE / 2 is a float,
    # which breaks sample(f_1) and reshape(f_1, 1) below.
    f_1 = BATCH_SIZE // 2  # fraction of the batch drawn from bucket 1
    f_2 = BATCH_SIZE // 2  # fraction of the batch drawn from bucket 2
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        track_r = []
        critic._v_ = []    # clean critic loss buffer
        actor._loss_ = []  # clean actor loss buffer
        for t in count():
            if RENDER:
                env.render()
            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            track_r.append(r)
            if not done:
                # Save non-final transition into the bucket matching its reward sign.
                if r > 0:
                    replay_memory_1.save(s, a, r, s_)
                else:
                    replay_memory_2.save(s, a, r, s_)
            # Learn once both buckets can fill their half of the batch.
            if len(replay_memory_1) >= f_1 and len(replay_memory_2) >= f_2:
                batch1 = Transition(*zip(*replay_memory_1.sample(f_1)))
                batch2 = Transition(*zip(*replay_memory_2.sample(f_2)))
                s_b = np.append(np.asarray(batch1.state),
                                np.asarray(batch2.state), axis=0)
                s_b_n = np.append(np.asarray(batch1.next_state),
                                  np.asarray(batch2.next_state), axis=0)
                a_b = np.append(np.asarray(batch1.action).reshape(f_1, 1),
                                np.asarray(batch2.action).reshape(f_2, 1), axis=0)
                r_b = np.append(np.asarray(batch1.reward).reshape(f_1, 1),
                                np.asarray(batch2.reward).reshape(f_2, 1), axis=0)
                td_error, abs_error = critic.learn(s_b, r_b, s_b_n)  # Critic learn
                actor.learn(s_b, a_b, td_error)                      # Actor learn
            s = s_
            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
            if done or t >= MAX_EP_STEPS:
                # Episode finished: record episode statistics.
                # (Removed dead `running_reward_avg = ep_rs_sum / float(t)`:
                # it was unused and divided by zero when done at t == 0.)
                ep_rs_sum = sum(track_r)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                break
    return reward_per_epi, durations_per_epi, l_A, l_C
# NOTE(review): incomplete fragment of a TD actor-critic training loop --
# it ends at `if done or t >= MAX_EP_STEPS:` with no body, and the names
# sess/actor/env/N_F/GAMMA/LR_C/RENDER come from context not visible here.
# Presumably the missing body records episode stats and breaks -- confirm
# against the full source. Left byte-identical.
critic = Critic(sess=sess, n_features=N_F, gamma=GAMMA, lr=LR_C) sess.run(tf.global_variables_initializer()) if OUTPUT_GRAPH: tf.summary.FileWriter('logs/', sess.graph) for epoch in range(MAX_EPISODE): s = env.reset() t = 0 track_r = [] while True: if RENDER: env.render() a = actor.choose_action(s) s_, r, done, info = env.step(a) if done: r = -20 track_r.append(r) td_error = critic.learn(s, r, s_) actor.learn(s, a, td_error) s = s_ t += 1 if done or t >= MAX_EP_STEPS:
# NOTE(review): incomplete fragment nearly identical to run() above (same
# build_net / choose_action / critic.learn(s, s_) sequence), cut off before
# the episode-termination handling. Like run(), `a` is never advanced to
# `a_` inside the loop -- presumably the same stale-action bug; confirm
# against the full source. Left byte-identical.
actor.build_net() critic.build_net() sess.run(tf.global_variables_initializer()) #tf.summary.FileWriter("",sess.graph) #count steps step = 0 #env.render() for episode in range(n_episode): s = env.reset() #comment the render() to speed up #env.render() #s returned by gym is a vector, we need to transform it into a matrix s = s[np.newaxis, :] a = actor.choose_action(s) while(True): step += 1 #a new transition s_, r, done, info = env.step(a) #in order to let s_ add one rank(matrix) s_ = s_[np.newaxis,:] a_ = actor.choose_action(s_) #calculate td_error td_error = critic.learn(s,s_) actor.learn(s,a,td_error) s =s_ if step%500 == 0: print(step,s_)
# NOTE(review): incomplete fragment of the same TD actor-critic loop shape
# as the fragment above -- it runs past the visible span, ending mid-block
# at `ep_rs_sum = sum(track_r)`. The episode bookkeeping that presumably
# follows is not visible here; confirm against the full source. Left
# byte-identical.
critic = Critic(sess,n_features=n_features,gamma = GAMMA,lr = LR_C) sess.run(tf.global_variables_initializer()) if OUTPUT_GRAPH: tf.summary.FileWriter("logs/", sess.graph) for i_episode in range(MAX_EPISODE): s = env.reset() t = 0 track_r = [] while True: if RENDER: env.render() a = actor.choose_action(s) s_,r,done,info = env.step(a) if done: r = -20 track_r.append(r) td_error = critic.learn(s,r,s_) actor.learn(s,a,td_error) s = s_ t += 1 if done or t >= MAX_EP_STEPS: ep_rs_sum = sum(track_r)