def start_er(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000): #print ("make enviornment") env = gym.make(GAME_NAME) #print ("create actor, critic") actor = Actor(env.observation_space, env.action_space) critic = Critic(env.observation_space, env.action_space) reward_per_epi=[] durations_per_epi=[] l_A=[] l_C=[] MAX_EPISODE = 500 RENDER = False MAX_EP_STEPS= 1000 DISPLAY_REWARD_THRESHOLD=200 BATCH_SIZE=BATCH_SIZE MEMORY_CAPACITY=MEMORY_CAPACITY replay_memory = ReplayMemory(MEMORY_CAPACITY) #print ("begin.\n") for i_episode in range(MAX_EPISODE): s = env.reset() track_r = [] critic._v_=[] actor._loss_=[] for t in count(): if RENDER: env.render() a = actor.choose_action(s) s_, r, done, info = env.step(a) ##if done: r = -20 # Penalty if die track_r.append(r) # ACER learn from experience if not done: replay_memory.save(s, a, r, s_) # Save non-final transition into memeory if len(replay_memory) >= BATCH_SIZE: transitions = replay_memory.sample(BATCH_SIZE) # Sample from memory for training batch = Transition(*zip(*transitions)) s_b = np.asarray(batch.state) s_b_n = np.asarray(batch.next_state) a_b = np.asarray(batch.action).reshape(BATCH_SIZE, 1) r_b = np.asarray(batch.reward).reshape(BATCH_SIZE, 1) td_error, abs_error = critic.learn(s_b, r_b, s_b_n) # Critic Learn actor.learn(s_b, a_b, td_error) # Actor Learn ################## ################ s = s_ ##print ("... in episode (%d) step (%d)" % (i_episode+1,t)) if is_ipython: display.clear_output(wait=True) display.display(plt.gcf()) #env.render() if done or t >= MAX_EP_STEPS: # Episode finished, print results ep_rs_sum = sum(track_r) #if 'running_reward' not in globals(): # running_reward = ep_rs_sum #else: # running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering running_reward_avg = ep_rs_sum/float(t) reward_per_epi.append(ep_rs_sum) durations_per_epi.append(t) l_A.append(np.mean(actor._loss_)) l_C.append(np.mean(critic._loss_)) #print("episode:", i_episode, " reward:", ep_rs_sum) #plot(reward_per_epi, durations_per_epi, l_A, l_C) break return reward_per_epi, durations_per_epi, l_A, l_C
def start_p(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000):
    # Actor-Critic with prioritized experience replay (sum-tree buffer).
    env = gym.make(GAME_NAME)
    actor = Actor(env.observation_space, env.action_space)
    critic = Critic(env.observation_space, env.action_space)

    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []

    MAX_EPISODE = 200
    RENDER = False
    MAX_EP_STEPS = 1000
    DISPLAY_REWARD_THRESHOLD = 200
    replay_memory = SumTreeMemoryBuffer(MEMORY_CAPACITY)

    #print("begin.\n\n")
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        track_r = []
        critic._v_ = []      # reset critic's per-episode buffer
        actor._loss_ = []    # reset actor's per-episode loss buffer
        for t in count():
            if RENDER: env.render()
            a = actor.choose_action(s)
            s_, r, done, info = env.step(a)
            ##if done: r = -20   # optional penalty on termination
            track_r.append(r)

            # ACER: Actor-Critic with Experience Replay
            if not done:
                transition = np.hstack((s, a, r, s_))
                replay_memory.save(transition)   # Save non-final transition
            #print(len(replay_memory))
            #print(replay_memory.data)
            #print(replay_memory.gettree)
            if len(replay_memory) >= BATCH_SIZE:   # enough transitions stored to fill a batch
                tree_idx, batch, ISWeights = replay_memory.sample(BATCH_SIZE)   # Prioritized sample
                # Hard-coded slicing assumes an 8-dimensional observation and scalar
                # action/reward (e.g. LunarLander-v2); only the last sampled row is used here.
                s_b = np.asarray(batch[-1, 0:8])      # state
                s_b_n = np.asarray(batch[-1, 10:18])  # next state
                a_b = np.asarray(batch[-1, 8])        # action
                r_b = np.asarray(batch[-1, 9])        # reward
                #print("tree_idx: " + str(tree_idx))
                #print(ISWeights)
                td_error, abs_error = critic.learn(s_b, r_b, s_b_n, ISWeights)   # Critic learns
                replay_memory.batch_update(tree_idx, abs_error)   # Update priorities in the tree
                actor.learn(s_b, a_b, td_error)                   # Actor learns
                #print("td_error: " + str(td_error))
                print("abs_error: " + str(abs_error))

            s = s_
            #print("... in episode (%d) step (%d)" % (i_episode+1, t))
            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())
            #env.render()

            if done or t >= MAX_EP_STEPS:
                # Episode finished, record results
                ep_rs_sum = sum(track_r)
                #if 'running_reward' not in globals():
                #    running_reward = ep_rs_sum
                #else:
                #    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True   # rendering
                running_reward_avg = ep_rs_sum / float(t)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(running_reward_avg)   ## record average reward per step here
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                #print("episode:", i_episode, "  reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)
                break

    return reward_per_epi, durations_per_epi, l_A, l_C
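
# start_p assumes a prioritized buffer whose sample() returns (tree_idx, batch, ISWeights)
# and whose batch_update() re-prioritizes transitions by their absolute TD error. A minimal
# sketch of that contract, using a flat priority array instead of an actual sum tree, is
# shown below; the SumTreeMemoryBuffer used above is assumed to implement the same interface
# more efficiently. The class name and the alpha/beta/epsilon hyperparameters are illustrative.
import numpy as np

class PrioritizedBufferSketch(object):
    def __init__(self, capacity, alpha=0.6, beta=0.4, epsilon=0.01):
        self.capacity = capacity
        self.alpha = alpha          # how strongly priorities skew sampling
        self.beta = beta            # strength of importance-sampling correction
        self.epsilon = epsilon      # keeps every priority strictly positive
        self.data = np.zeros((capacity,), dtype=object)
        self.priorities = np.zeros(capacity, dtype=np.float64)
        self.position = 0
        self.size = 0

    def save(self, transition):
        # New transitions get the current maximum priority so they are replayed at least once
        max_p = self.priorities[:self.size].max() if self.size > 0 else 1.0
        self.data[self.position] = transition
        self.priorities[self.position] = max_p
        self.position = (self.position + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)

    def sample(self, batch_size):
        p = self.priorities[:self.size] ** self.alpha
        probs = p / p.sum()
        idx = np.random.choice(self.size, batch_size, p=probs)
        batch = np.vstack(self.data[idx])
        weights = (self.size * probs[idx]) ** (-self.beta)
        weights /= weights.max()                      # normalize for numerical stability
        return idx, batch, weights.reshape(-1, 1)

    def batch_update(self, idx, abs_errors):
        # New priority is proportional to the magnitude of the TD error
        self.priorities[idx] = np.abs(abs_errors).flatten() + self.epsilon

    def __len__(self):
        return self.size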
def start_er(GAME_NAME, BATCH_SIZE=32, MEMORY_CAPACITY=50000): #print ("make enviornment") env = gym.make(GAME_NAME) #print ("create actor, critic") actor = Actor(env.observation_space, env.action_space) critic = Critic(env.observation_space, env.action_space) reward_per_epi = [] durations_per_epi = [] l_A = [] l_C = [] MAX_EPISODE = 200 RENDER = False MAX_EP_STEPS = 1000 DISPLAY_REWARD_THRESHOLD = 200 BATCH_SIZE = BATCH_SIZE MEMORY_CAPACITY = MEMORY_CAPACITY replay_memory_1 = ReplayMemory(MEMORY_CAPACITY) replay_memory_2 = ReplayMemory(MEMORY_CAPACITY) f_1 = BATCH_SIZE / 2 # define fraction for 2 buckets f_2 = BATCH_SIZE / 2 #print ("begin.\n") for i_episode in range(MAX_EPISODE): s = env.reset() track_r = [] critic._v_ = [] actor._loss_ = [] for t in count(): if RENDER: env.render() a = actor.choose_action(s) s_, r, done, info = env.step(a) track_r.append(r) if not done: replay_memory_1.save( s, a, r, s_) if r > 0 else replay_memory_2.save( s, a, r, s_) # Save non-final transition into memory #learn form memory if len(replay_memory_1) >= f_1 and len( replay_memory_2 ) >= f_2: # if positive D is enough, the other must as well transitions_1 = replay_memory_1.sample( f_1) # Sample from 2 buckets batch1 = Transition(*zip(*transitions_1)) transitions_2 = replay_memory_2.sample(f_2) batch2 = Transition(*zip(*transitions_2)) s_b = np.append(np.asarray(batch1.state), np.asarray(batch2.state), axis=0) s_b_n = np.append(np.asarray(batch1.next_state), np.asarray(batch2.next_state), axis=0) a_b = np.append(np.asarray(batch1.action).reshape(f_1, 1), np.asarray(batch2.action).reshape(f_2, 1), axis=0) r_b = np.append(np.asarray(batch1.reward).reshape(f_1, 1), np.asarray(batch2.reward).reshape(f_2, 1), axis=0) td_error, abs_error = critic.learn(s_b, r_b, s_b_n) # Critic Learn actor.learn(s_b, a_b, td_error) # Actor Learn s = s_ ##print ("... in episode (%d) step (%d)" % (i_episode+1,t)) if is_ipython: display.clear_output(wait=True) display.display(plt.gcf()) #env.render() if done or t >= MAX_EP_STEPS: # Episode finished, print results ep_rs_sum = sum(track_r) #if 'running_reward' not in globals(): # running_reward = ep_rs_sum #else: # running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering running_reward_avg = ep_rs_sum / float(t) reward_per_epi.append(ep_rs_sum) durations_per_epi.append(t) l_A.append(np.mean(actor._loss_)) l_C.append(np.mean(critic._loss_)) #print("episode:", i_episode, " reward:", ep_rs_sum) #plot(reward_per_epi, durations_per_epi, l_A, l_C) break return reward_per_epi, durations_per_epi, l_A, l_C