def compete_and_return_score_list(sess, agent1, agent2, num_games):
    e2 = 0.20
    e3 = 0.05
    agent1_to_start = False
    wList = []
    for i in range(num_games):
        d = False
        j = 0
        g = Connect4Game(announce_winner=True)
        agent1_to_start = not agent1_to_start
        agent1s_turn = not agent1_to_start
        while j < 50 and np.sum(open_actions(g)) > 0 and not d:
            agent1s_turn = not agent1s_turn
            j += 1
            s = get_state(g)
            filter = open_actions(g)
            # introduce some random behaviour, otherwise two games would settle it
            top = 3
            randval = np.random.rand(1)
            if randval > e2:
                top = 1
            elif randval > e3:
                top = 2
            if agent1s_turn:
                allQ = sess.run(agent1.Qout,
                                feed_dict={agent1.inputs: s, agent1.keep_pct: 1})
                a = best_allowed_action(allQ, filter, top)
            else:
                allQ = sess.run(agent2.Qout,
                                feed_dict={agent2.inputs: s, agent2.keep_pct: 1})
                a = best_allowed_action(allQ, filter, top)
            g.play_piece(g.first_empty_row(a), a)
            d = g.current_state == Connect4Game.GAME_OVER
        # agent1s_turn now marks the player who made the last (winning) move
        if g.current_state == Connect4Game.GAME_OVER and g.winner is not None:
            if agent1s_turn:
                wList.append(-1)
            else:
                wList.append(1)
        else:
            wList.append(0)
    return wList
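Every function in this section calls best_allowed_action with a top argument of 1, 2 or 3, so it presumably picks among the top-k Q-values restricted to open columns (top=1 being plain greedy). A minimal sketch under that assumption; the real implementation may break ties or randomize differently:

import numpy as np

def best_allowed_action(all_q, action_filter, top):
    # Sketch only: assumes all_q has shape (1, n_columns) and action_filter
    # is a 0/1 vector marking playable columns, matching the call sites above.
    q = all_q[0].astype(float).copy()
    q[np.asarray(action_filter) == 0] = -np.inf    # mask full columns
    allowed = np.flatnonzero(np.isfinite(q))       # callers guarantee this is non-empty
    k = min(top, len(allowed))                     # never ask for more than exist
    best_k = allowed[np.argsort(q[allowed])[-k:]]  # the k best open columns
    return int(np.random.choice(best_k))           # uniform pick among them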
def duel_and_save_games(sess, agent1, agent2, duelname):
    agent1_to_start = False
    wList = []
    for i in range(2):
        bList = []
        d = False
        j = 0
        g = Connect4Game(announce_winner=True)
        agent1_to_start = not agent1_to_start
        agent1s_turn = not agent1_to_start
        while j < 50 and np.sum(open_actions(g)) > 0 and not d:
            agent1s_turn = not agent1s_turn
            j += 1
            s = get_state(g)
            filter = open_actions(g)
            if agent1s_turn:
                allQ = sess.run(agent1.Qout,
                                feed_dict={agent1.inputs: s, agent1.keep_pct: 1})
                a = best_allowed_action(allQ, filter, 1)
            else:
                allQ = sess.run(agent2.Qout,
                                feed_dict={agent2.inputs: s, agent2.keep_pct: 1})
                a = best_allowed_action(allQ, filter, 1)
            g.play_piece(g.first_empty_row(a), a)
            d = g.current_state == Connect4Game.GAME_OVER
            bList.append(copy.deepcopy(g.board_position))
        if g.current_state == Connect4Game.GAME_OVER and g.winner is not None:
            if agent1s_turn:
                wList.append(-1)
            else:
                wList.append(1)
        else:
            wList.append(0)
        # save game
        save_game(bList, "c4games/{0}_game{1}.txt".format(duelname, i + 1))
    return wList
def on_update(self, delta_time: float):
    if self.current_game.current_player == self.champion_player_id \
            and self.current_game.current_state == Connect4Game.GAME_RUNNING:
        # CHAMPION TO PLAY
        s = get_state(self.current_game)
        action_filter = open_actions(self.current_game)
        all_q = self.champion_agent.qnetwork.model.predict(s)
        a = best_allowed_action(all_q, action_filter, 1)
        self.play_piece(self.current_game.first_empty_row(a), a)
def compete_and_return_score_list(agent1, agent2, num_games):
    e2 = 0.20
    e3 = 0.05
    agent1_to_start = False
    w_list = []
    for episode_no in range(num_games):
        d = False
        episode_len = 0
        g = Connect4Game(announce_winner=True)
        agent1_to_start = not agent1_to_start
        agent1s_turn = not agent1_to_start
        while episode_len < max_num_step and np.sum(open_actions(g)) > 0 and not d:
            agent1s_turn = not agent1s_turn
            episode_len += 1
            s = get_state(g)
            action_filter = open_actions(g)
            # introduce some random behaviour, otherwise two games would settle it
            top = 3
            rand_val = np.random.rand(1)
            if rand_val > e2:
                top = 1
            elif rand_val > e3:
                top = 2
            if agent1s_turn:
                all_q = agent1.qnetwork.predict(s)
                a = best_allowed_action(all_q, action_filter, top)
            else:
                all_q = agent2.qnetwork.predict(s)
                a = best_allowed_action(all_q, action_filter, top)
            g.play_piece(g.first_empty_row(a), a)
            d = g.current_state == Connect4Game.GAME_OVER
        if g.current_state == Connect4Game.GAME_OVER and g.winner is not None:
            if agent1s_turn:
                w_list.append(-1)
            else:
                w_list.append(1)
        else:
            w_list.append(0)
    return w_list
def on_update(self, delta_time: float):
    if self.current_game.current_player == self.champion_player_id \
            and self.current_game.current_state == Connect4Game.GAME_RUNNING:
        # CHAMPION TO PLAY
        s = get_state(self.current_game)
        filter = open_actions(self.current_game)
        allQ = self.sess.run(self.champion_agent.Qout,
                             feed_dict={self.champion_agent.inputs: s,
                                        self.champion_agent.keep_pct: 1})
        a = best_allowed_action(allQ, filter, 1)
        self.play_piece(self.current_game.first_empty_row(a), a)
def train_agent_against_list(agent, opponents, episode_start_count=0):
    # create lists to contain total rewards and steps per episode
    jList = []
    rList = []
    agents_turn_to_start = False
    for i in range(num_episodes):
        # before episode_start_count was added, training converged after approximately 60 generations
        # TODO: find a decreasing expression that approaches 0, not 0.08, after 5000 episodes
        e = e_init * 1. / log((i + episode_start_count) / 10 + exp(1))
        e2 = 0.30
        e3 = 0.10
        # Reset environment and get first new observation
        g = Connect4Game(announce_winner=True)
        rAll = 0
        j = 0
        op_idx = -1
        if len(opponents) > 0:
            op_idx = np.random.randint(len(opponents))
        agents_turn_to_start = not agents_turn_to_start
        if op_idx > -1 and not agents_turn_to_start:
            opponent = opponents[op_idx]
            # let opponent play first move
            filter = open_actions(g)
            s = get_state(g)
            allQopp = opponent.model.predict(s)
            # introduce some random behaviour, a deterministic player is too easy to learn to beat
            top = 3
            randval = np.random.rand(1)
            if randval > e2:
                top = 1
            elif randval > e3:
                top = 2
            a = best_allowed_action(allQopp, filter, top)
            g.play_piece(g.first_empty_row(a), a)
        # The game training
        while j < 100 and np.sum(open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
            targetQ = np.zeros((1, Connect4Game.COLUMN_COUNT))
            original_a = -1
            # agent
            if np.sum(open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                j += 1
                filter = open_actions(g)
                s = get_state(g)
                # Choose an action
                allQ = agent.model.predict(s)
                if np.random.rand(1) < e:
                    # full random
                    a = rand_index_filter(filter)
                    original_a = a
                else:
                    # greedy
                    a = best_allowed_action(allQ, filter, 1)
                    original_a = np.argmax(allQ, axis=1)
                # initialize target
                targetQ = allQ
                # Get new state and reward from environment
                g.play_piece(g.first_empty_row(a), a)
                r = get_reward(g)
                # set expectations in case the opponent is not allowed to play or does not exist
                maxQ1 = get_max_future_reward_previous_player(g)
            # then opponent
            if op_idx > -1 and np.sum(open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                opponent = opponents[op_idx]
                j += 1
                filter = open_actions(g)
                op_s = get_state(g)
                allQopp = opponent.model.predict(op_s)
                # introduce some random behaviour, a deterministic player is too easy to learn to beat
                top = 3
                randval = np.random.rand(1)
                if randval > e2:
                    top = 1
                elif randval > e3:
                    top = 2
                op_a = best_allowed_action(allQopp, filter, top)
                g.play_piece(g.first_empty_row(op_a), op_a)
                op_rew = get_reward(g)
                r = r - 0.9 * op_rew
                # update expectations in case the opponent was allowed to move
                maxQ1 = get_max_future_reward_current_player(g)
            # update after the opponent has played
            if original_a != a:
                # this improved game understanding a lot
                targetQ[0, original_a] = np.min(targetQ)
            targetQ[0, a] = r + y * maxQ1
            # Train our network using target and predicted Q values
            # Changed from s1 to s
            _ = agent.model.fit(s, targetQ, epochs=1, verbose=0)
            rAll += r
        jList.append(j)
        rList.append(rAll)
        if (i + 1) % 5 == 0:
            print("Training {0} Episodes: {1} E: {2:.3f} J: {3:.3f} R: {4:.3f}"
                  .format(agent.name, i + 1 + episode_start_count, e,
                          np.mean(jList), np.mean(rList)))
            jList = []
            rList = []
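The TODO above complains that the logarithmic decay plateaus around 0.08 instead of reaching 0. A quick numeric check makes the point, comparing it with the linear annealing used in the later training functions. The constants here are illustrative only (e_init = 0.5 etc. are assumptions; the project defines its own values):

from math import log, exp

e_init, e_end, e_steps = 0.5, 0.05, 5000  # assumed values for illustration

for i in (0, 500, 2500, 5000):
    log_decay = e_init * 1. / log(i / 10 + exp(1))
    linear = e_init - (e_init - e_end) / e_steps * (i + 1)
    print("episode {0:5d}: log-decay e={1:.3f}  linear e={2:.3f}".format(
        i, log_decay, linear))
# episode     0: log-decay e=0.500  linear e=0.500
# episode   500: log-decay e=0.126  linear e=0.455
# episode  2500: log-decay e=0.090  linear e=0.275
# episode  5000: log-decay e=0.080  linear e=0.050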
def train_agent_against_list(agent, opponents, num_games, episode_start_count=0):
    # create lists to contain total rewards and steps per episode
    episode_len_list = []
    r_list = []
    loss_list = []
    mae_list = []
    agents_turn_to_start = False
    for episode_no in range(num_games):
        action_list = []
        opp_action_list = []
        reward_list = []
        state_list = []
        target_q_list = []
        e = e_init - (e_init - e_end) / annealing_steps * (episode_no + 1 + episode_start_count)
        e2 = 0.20
        e3 = 0.05
        # Reset environment and get first new observation
        g = Connect4Game(announce_winner=True)
        episode_len = 0
        opp_idx = -1
        if len(opponents) > 0:
            opp_idx = np.random.randint(len(opponents))
        agents_turn_to_start = not agents_turn_to_start
        if opp_idx > -1 and not agents_turn_to_start:
            opponent = opponents[opp_idx]
            # let opponent play first move
            s = get_state(g)
            all_q_opp = opponent.qnetwork.predict(s)
            # introduce some random behaviour, a deterministic player is too easy to learn to beat
            top = 3
            rand_val = np.random.rand(1)
            if rand_val > e2:
                top = 1
            elif rand_val > e3:
                top = 2
            opp_a = best_allowed_action(all_q_opp, open_actions(g), top)
            opp_action_list.append(opp_a)
            episode_len += 1
            g.play_piece(g.first_empty_row(opp_a), opp_a)
        # The game training
        while episode_len < max_num_step and np.sum(open_actions(g)) > 0 \
                and g.current_state == g.GAME_RUNNING:
            r = 0
            # agent
            if np.sum(open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                episode_len += 1
                s = get_state(g)
                state_list.append(s)
                # Choose an action
                all_q = agent.qnetwork.predict(s)
                if np.random.rand(1) < e:
                    # full random
                    a = rand_index_filter(open_actions(g))
                    original_a = a
                else:
                    # greedy
                    a = best_allowed_action(all_q, open_actions(g), 1)
                    original_a = np.argmax(all_q, axis=1)
                action_list.append(a)
                # initialize target
                target_q_list.append(all_q)
                # update target if a non-open action was originally preferred
                if original_a != a:
                    # this update improved game understanding a lot
                    target_q_list[-1][0][original_a] = np.min(target_q_list[-1])
                g.play_piece(g.first_empty_row(a), a)
                # initiate reward
                r = get_reward(g)
            # then opponent
            if opp_idx > -1 and np.sum(open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                opponent = opponents[opp_idx]
                episode_len += 1
                opp_s = get_state(g)
                all_q_opp = opponent.qnetwork.predict(opp_s)
                # introduce some random behaviour, a deterministic player is too easy to learn to beat
                top = 3
                rand_val = np.random.rand(1)
                if rand_val > e2:
                    top = 1
                elif rand_val > e3:
                    top = 2
                opp_a = best_allowed_action(all_q_opp, open_actions(g), top)
                opp_action_list.append(opp_a)
                g.play_piece(g.first_empty_row(opp_a), opp_a)
                opp_rew = get_reward(g)
                # update reward after opponent move
                r = r - 0.9 * opp_rew
            reward_list.append(r)
        # update entire target with discounted rewards once the game is over
        dr = discount_rewards(reward_list, gamma=y)
        for a_id in range(len(action_list)):
            target_q_list[a_id][0][action_list[a_id]] = dr[a_id]
        if gen_count <= experience_run_in:
            train_states_ready, train_target_q_ready = dup_mirror_input(
                np.stack(state_list, axis=0), np.concatenate(target_q_list))
            if gen_count > experience_run_in - pre_buffer:
                # build initial experience buffer
                shared_experience.add_from_lists(action_list, state_list, dr)
        else:
            shared_experience.add_from_lists(action_list, state_list, dr)
            train_batch = shared_experience.sample(batch_size)
            # Separate the batch into its components
            train_states = np.stack(train_batch[:, 0].tolist(), axis=0)
            train_actions = train_batch[:, 1]
            train_rewards = train_batch[:, 2]
            # obtain new, refreshed target Q's
            train_target_q = agent.qnetwork.predict(train_states)
            for a_id in range(len(train_actions)):
                train_target_q[a_id, train_actions[a_id]] = train_rewards[a_id]
            train_states_ready, train_target_q_ready = dup_mirror_input(
                np.vstack([train_states, np.stack(state_list, axis=0)]),
                np.vstack([train_target_q, np.concatenate(target_q_list)]))
        # train network using target and predicted Q values after each game, with discounted reward
        loss, mae = agent.qnetwork.train_on_batch(train_states_ready, train_target_q_ready)
        loss_list.append(loss)
        mae_list.append(mae)
        episode_len_list.append(episode_len)
        r_list.append(sum(dr) / episode_len)
        if (episode_start_count + episode_no + 1) % print_interval == 0:
            s = "Training {0} Episodes: {1} E: {2:.3f} L: {3:.3f} R: {4:.3f} Loss:{5:.3f} MAE:{6:.3f} Buffer:{7}"
            print(s.format(agent.name, episode_no + 1 + episode_start_count, e,
                           np.mean(episode_len_list), np.mean(r_list),
                           np.mean(loss_list), np.mean(mae_list),
                           len(shared_experience.buffer)))
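The shared_experience object above is not defined in this section. Inferring its interface from the call sites (add_from_lists, sample, and the .buffer attribute used for the print-out), a minimal sketch could look like the following; the real class may store extra fields, sample with replacement, or shape states differently:

import numpy as np
from collections import deque

class ExperienceBuffer:
    # Sketch of the inferred interface: stores (state, action, discounted_reward)
    # triples so that sample() supports the train_batch[:, 0/1/2] slicing above.

    def __init__(self, buffer_size=50000):
        self.buffer = deque(maxlen=buffer_size)  # old entries fall out automatically

    def add_from_lists(self, action_list, state_list, discounted_rewards):
        for a, s, r in zip(action_list, state_list, discounted_rewards):
            # assumes each state is a (1, n) row; store it flattened
            self.buffer.append((np.asarray(s).ravel(), a, r))

    def sample(self, batch_size):
        # assumes the buffer already holds at least batch_size entries
        idx = np.random.choice(len(self.buffer), size=batch_size, replace=False)
        # object array so column slicing yields states, actions, rewards
        return np.array([self.buffer[i] for i in idx], dtype=object)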
def train_agent(sess, agent, opponent):
    # create lists to contain total rewards and steps per episode
    jList = []
    rList = []
    agents_turn_to_start = False
    for i in range(num_episodes):
        e = e_init * 1. / log(i / 10 + exp(1))
        e2 = 0.30
        e3 = 0.10
        # Reset environment and get first new observation
        g = Connect4Game(announce_winner=True)
        rAll = 0
        j = 0
        agents_turn_to_start = not agents_turn_to_start
        if opponent is not None and not agents_turn_to_start:
            # let opponent play first move
            filter = open_actions(g)
            s = get_state(g)
            allQopp = sess.run(opponent.Qout,
                               feed_dict={opponent.inputs: s, opponent.keep_pct: 1})
            # introduce some random behaviour, a deterministic player is too easy to learn to beat
            top = 3
            randval = np.random.rand(1)
            if randval > e2:
                top = 1
            elif randval > e3:
                top = 2
            a = best_allowed_action(allQopp, filter, top)
            g.play_piece(g.first_empty_row(a), a)
        # The game training
        while j < 100 and np.sum(open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
            targetQ = np.zeros((1, Connect4Game.COLUMN_COUNT))
            # agent
            if np.sum(open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                j += 1
                filter = open_actions(g)
                s = get_state(g)
                # Choose an action
                allQ = sess.run(agent.Qout,
                                feed_dict={agent.inputs: s, agent.keep_pct: 1})
                if np.random.rand(1) < e:
                    # full random
                    a = rand_index_filter(filter)
                else:
                    # greedy
                    a = best_allowed_action(allQ, filter, 1)
                # initialize target
                targetQ = allQ
                # Get new state and reward from environment
                g.play_piece(g.first_empty_row(a), a)
                r = get_reward(g)
                # set expectations in case the opponent is not allowed to play or does not exist
                maxQ1 = get_max_future_reward_previous_player(g)
            # then opponent
            if opponent is not None and np.sum(open_actions(g)) > 0 \
                    and g.current_state == g.GAME_RUNNING:
                j += 1
                filter = open_actions(g)
                op_s = get_state(g)
                allQopp = sess.run(opponent.Qout,
                                   feed_dict={opponent.inputs: op_s, opponent.keep_pct: 1})
                # introduce some random behaviour, a deterministic player is too easy to learn to beat
                top = 3
                randval = np.random.rand(1)
                if randval > e2:
                    top = 1
                elif randval > e3:
                    top = 2
                op_a = best_allowed_action(allQopp, filter, top)
                g.play_piece(g.first_empty_row(op_a), op_a)
                op_rew = get_reward(g)
                r = r - 0.9 * op_rew
                # update expectations in case the opponent was allowed to move
                maxQ1 = get_max_future_reward_current_player(g)
            # update after the opponent has played
            targetQ[0, a] = r + y * maxQ1
            # Train our network using target and predicted Q values
            # Changed from s1 to s
            _ = sess.run(agent.updateModel,
                         feed_dict={agent.inputs: s, agent.keep_pct: 1,
                                    agent.nextQ: targetQ})
            rAll += r
        jList.append(j)
        rList.append(rAll)
        if (i + 1) % 100 == 0:
            print("Training {0} Episodes: {1} E: {2:.3f} J: {3:.3f} R: {4:.3f}"
                  .format(agent.name, i + 1, e, np.mean(jList), np.mean(rList)))
            jList = []
            rList = []
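rand_index_filter supplies the fully random branch of the epsilon-greedy policy above. A one-line sketch, assuming (as elsewhere) that its argument is a 0/1 vector of open columns:

import numpy as np

def rand_index_filter(action_filter):
    # uniformly random index among the open (nonzero) columns
    return int(np.random.choice(np.flatnonzero(np.asarray(action_filter))))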
def train_agent_against_list(sess, agent, opponents, episode_start_count=0):
    # create lists to contain total rewards and steps per episode
    jList = []
    rList = []
    agents_turn_to_start = False
    for i in range(num_episodes):
        action_list = []
        reward_list = []
        # before episode_start_count was added, training converged after approximately 60 generations
        e = e_init - (e_init - e_end) / e_steps * (i + 1 + episode_start_count)
        e2 = 0.20
        e3 = 0.05
        # Reset environment and get first new observation
        g = Connect4Game(announce_winner=True)
        rAll = 0
        j = 0
        op_idx = -1
        if len(opponents) > 0:
            op_idx = np.random.randint(len(opponents))
        agents_turn_to_start = not agents_turn_to_start
        if op_idx > -1 and not agents_turn_to_start:
            opponent = opponents[op_idx]
            # let opponent play first move
            filter = open_actions(g)
            s = get_state(g)
            allQopp = sess.run(opponent.Qout,
                               feed_dict={opponent.inputs: s, opponent.keep_pct: 1})
            # introduce some random behaviour, a deterministic player is too easy to learn to beat
            top = 3
            randval = np.random.rand(1)
            if randval > e2:
                top = 1
            elif randval > e3:
                top = 2
            a = best_allowed_action(allQopp, filter, top)
            g.play_piece(g.first_empty_row(a), a)
        states = np.zeros((1, board_size * 3))
        targetQ = np.zeros((1, Connect4Game.COLUMN_COUNT))
        # The game training
        while j < 100 and np.sum(open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
            # grow the state and target arrays by one row per agent move
            temp_states = states
            states = np.zeros((len(action_list) + 1, board_size * 3))
            temp_targetQ = targetQ
            targetQ = np.zeros((len(action_list) + 1, Connect4Game.COLUMN_COUNT))
            if len(action_list) > 0:
                states[:-1, :] = temp_states
                targetQ[:-1, :] = temp_targetQ
            original_a = -1
            # agent
            if np.sum(open_actions(g)) > 0 and g.current_state == g.GAME_RUNNING:
                j += 1
                filter = open_actions(g)
                s = get_state(g)
                states[len(action_list), :] = s
                # Choose an action
                allQ = sess.run(agent.Qout,
                                feed_dict={agent.inputs: s, agent.keep_pct: 1})
                if np.random.rand(1) < e:
                    # full random
                    a = rand_index_filter(filter)
                    original_a = a
                else:
                    # greedy
                    a = best_allowed_action(allQ, filter, 1)
                    original_a = np.argmax(allQ, axis=1)
                action_list.append(a)
                # initialize target
                targetQ[len(action_list) - 1, :] = allQ
                # Get new state and reward from environment
                g.play_piece(g.first_empty_row(a), a)
                r = get_reward(g)
            # then opponent
            if op_idx > -1 and np.sum(open_actions(g)) > 0 \
                    and g.current_state == g.GAME_RUNNING:
                opponent = opponents[op_idx]
                j += 1
                filter = open_actions(g)
                op_s = get_state(g)
                allQopp = sess.run(opponent.Qout,
                                   feed_dict={opponent.inputs: op_s, opponent.keep_pct: 1})
                # introduce some random behaviour, a deterministic player is too easy to learn to beat
                top = 3
                randval = np.random.rand(1)
                if randval > e2:
                    top = 1
                elif randval > e3:
                    top = 2
                op_a = best_allowed_action(allQopp, filter, top)
                g.play_piece(g.first_empty_row(op_a), op_a)
                op_rew = get_reward(g)
                r = r - 0.9 * op_rew
            # update after the opponent has played
            if original_a != a:
                # this improved game understanding a lot
                targetQ[len(action_list) - 1, original_a] = np.min(
                    targetQ[len(action_list) - 1, :])
            reward_list.append(r)
            rAll += r
        # update entire target with discounted rewards once the game is over
        dr = discount_rewards(reward_list)
        for a_id in range(len(action_list)):
            targetQ[a_id, action_list[a_id]] = dr[a_id]
        states, targetQ = dup_mirror_input(states, targetQ)
        # Train our network using target and predicted Q values after each game, with discounted reward
        _ = sess.run(agent.updateModel,
                     feed_dict={agent.inputs: states, agent.keep_pct: 1,
                                agent.nextQ: targetQ})
        jList.append(j)
        rList.append(sum(dr) / j)
        if (i + 1) % 50 == 0:
            sess.run(agent.training_episodes.assign_add(50))
            print("Training {0} Episodes: {1} E: {2:.3f} J: {3:.3f} R: {4:.3f}"
                  .format(agent.name, i + 1 + episode_start_count, e,
                          np.mean(jList), np.mean(rList)))
            jList = []
            rList = []
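Both discounted-reward trainers above lean on two helpers not shown in this section. The sketches below make their assumptions explicit: discount_rewards is taken to be the usual back-to-front cumulative discounting (the default gamma is a guess; one call site passes gamma=y), and dup_mirror_input is taken to exploit Connect 4's left-right symmetry by mirroring both the board planes and the target-Q columns. The real helpers may differ in detail:

import numpy as np

def discount_rewards(rewards, gamma=0.9):
    # back-to-front discounted cumulative rewards; default gamma is assumed
    discounted = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted

def dup_mirror_input(states, target_q, n_columns=7, n_planes=3):
    # Double the batch with left-right mirrored positions. Assumes each state
    # row is n_planes flattened boards of n_columns columns (matching the
    # board_size * 3 rows built above); the real encoding may differ.
    n = states.shape[0]
    boards = states.reshape(n, n_planes, -1, n_columns)  # (batch, plane, row, col)
    mirrored = boards[:, :, :, ::-1].reshape(n, -1)      # flip columns, re-flatten
    return (np.vstack([states, mirrored]),
            np.vstack([target_q, target_q[:, ::-1]]))    # mirror action columns too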