Example #1
def run(render):
    net = DeepQNetwork(sess,
                       N_A, N_S,
                       learning_rate=0.01,
                       reward_decay=0.9,
                       e_greedy=0.9,
                       replace_target_iter=200,
                       memory_size=2000,
                       scope='dqn_{0}'.format(0),
                       # output_graph=True
                       )

    sess.run(tf.global_variables_initializer())


    step = 0
    for episode in range(300):
        # initial observation
        s = env.reset()

        while True:
            # RL choose action based on observation
            a, q = net.choose_action(s)


            # RL take action and get next observation and reward
            s_, r, d, _ = env.step(a)
            if render: env.render()

            #print('rewards: {0}'.format(r))

            net.store_transition(s, a, r, s_)


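            # start learning once more than 200 steps of transitions have been collected, then learn every 5 steps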
            if (step > 200) and (step % 5 == 0):
                net.learn()

            # swap observation
            s = s_

            # break while loop when end of this episode
            if d:
                break
            step += 1
Example #2
    e_greedy=0.9,
    replace_target_iter=100,
    memory_size=2000,
    e_greedy_increment=0.001
    )

total_steps = 0

for i_episode in range(100):

    observation = env.reset()
    ep_r = 0
    while True:
        env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)

        # change to a more reasonable reward function
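        # r1 rewards keeping the cart near the center of the track,
        # r2 rewards keeping the pole close to upright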
        x, x_dot, theta, theta_dot = observation_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        reward = r1 + r2

        RL.store_transition(observation, action, reward, observation_)

        ep_r += reward
        if total_steps > 1000:
            RL.learn()
        
Example #3
        observation = env.reset()
        state_.append(observation)
        # observation = np.identity(16)[observation:observation + 1]
        # observation = np.expand_dims(observation, axis=2)
        # observation = np.expand_dims(observation, axis=3)

        # observation = rgb2gray(observation)
        score = 0

        while True:

            env.render()

            action = dqn.choose_action(
                np.expand_dims(np.array(list(state)), axis=2),
                counter < n_width)
            # action = dqn.choose_action(observation)
            # action = dqn.choose_action(np.reshape(observation, (1, 3, 1)))
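            # map the discrete action index 0..n_action-1 onto a symmetric
            # continuous value in [-2, 2]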
            f_action = (action - (n_action - 1) / 2) / ((n_action - 1) / 4)

            observation_, reward, done, info = env.step(np.array([f_action]))

            reward = reward / 10
            # observation_ = np.identity(16)[observation_:observation_ + 1]
            # observation_ = np.expand_dims(observation_, axis=2)
            # observation_ = np.expand_dims(observation_, axis=3)
            # observation_ = rgb2gray(observation_)

            score += reward
Example #4
def run(render):
    nets = []
    for i in range(4):
        net = DeepQNetwork(
            sess,
            N_A,
            N_S,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=200,
            memory_size=2000,
            scope='dqn_{0}'.format(i),
            # output_graph=True
        )
        nets.append(net)

    sess.run(tf.global_variables_initializer())

    step = 0
    for episode in range(300):
        # initial observation
        s = env.reset()

        while True:
            # fresh env
            if render: env.render()

            # RL choose action based on observation
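            # ensemble vote: each DQN proposes an action with its Q-value;
            # the weighted Q-values are averaged per action and the argmax is taken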
            q_sum = np.zeros(4, dtype=float)
            count = np.zeros(4, dtype=float)

            for i in range(len(nets)):
                net = nets[i]
                a, q = net.choose_action(s)
                q_sum[a] += W[i] * q
                count[a] += 1

            q_sum[count > 0] = q_sum[count > 0] / count[count > 0]
            a = np.argmax(q_sum)

            print('mean: {0}'.format(q_sum))

            # RL take action and get next observation and reward
            s_, r, d, _ = env.step(a)

            print('rewards: {0}'.format(r))

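            # each network stores the transition with its own reward component r[i]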
            for i in range(len(nets)):
                net = nets[i]
                net.store_transition(s, a, r[i], s_)

            if (step > 200) and (step % 5 == 0):
                for net in nets:
                    net.learn()

            # swap observation
            s = s_

            # break while loop when end of this episode
            if d:
                break
            step += 1
Example #5
    episodes = 20
    dqn = DeepQNetwork(env.action_space.n,
                       episodes=episodes,
                       observation_space=env.observation_space.n)
    episode_reward_np_array = np.zeros(episodes)
    random_probability = 1

    for episode in range(episodes):
        done = False
        episode_reward = 0
        current_state = env.reset()
        step_counter = 0
        while not done:
            dqn.set_random_probability(random_probability)
            # os.system("clear")
            action = dqn.choose_action(current_state)
            next_state, reward, done, info = env.step(action)

            dqn.store_experience(current_state, action, reward, next_state,
                                 done)

            if step_counter > 100 and step_counter % 5 == 0:
                dqn.learn()

            current_state = next_state

            episode_reward += reward
            step_counter += 1

            # env.render()
            # time.sleep(0.1)
Example #6
class trainer():
    def __init__(self, station_history):

        # Session Properties
        self.episodes = []
        self.stock_type = ""
        self.logging = False
        self.env_debug = False
        self.rl_debug = False
        self.bike_station = None
        self.operator = None
        self.sim_stock = []
        self.model_based = False
        self.ID = None
        self.method = None
        self.station_history = station_history

        # Performance Metric
        self.success_ratio = 0
        self.rewards = []  # [[r from session 1], [r from session 2] ...]
        self.avg_rewards = []  # [np.mean([r from session 1]), np.mean([r from session 2]) ...]
        self.final_stocks = []  # [[stock from session 1], [stock from session 2] ...]
        self.episode_action_history = []
        self.episode_stock_history = []
        self.session_action_history = []
        self.session_stock_history = []
        self.q_tables = []
        self.actions = [-10, -3, -1, 0]

    def start(self, episodes, stock_type, logging, env_debug, rl_debug, brain,
              ID, model_based):
        #brain: which method to use. Q learning vs DQN

        self.episodes = episodes
        self.stock_type = stock_type
        self.logging = logging
        self.env_debug = env_debug
        self.rl_debug = rl_debug
        self.brain = brain
        self.ID = ID
        self.model_based = model_based

        if brain == 'q' and not model_based:
            self.method = 'QLN'
        elif brain == 'q' and model_based:
            self.method = 'FCT'
        else:
            self.method = 'DQN'

        idx = 0

        for eps in self.episodes:

            # Initiate a new environment and RL agent
            self.bike_station = env(self.stock_type,
                                    debug=self.env_debug,
                                    ID=self.ID,
                                    station_history=self.station_history)
            self.sim_stock.append(self.bike_station.get_sim_stock())

            if self.brain == 'q':
                self.operator = agent(
                    epsilon=0.9,
                    lr=0.01,
                    gamma=0.9,
                    current_stock=self.bike_station.current_stock(),
                    debug=self.rl_debug,
                    expected_stock=self.bike_station.get_expected_stock(),
                    model_based=model_based)
            elif self.brain == 'dqn':
                self.operator = DeepQNetwork(self.bike_station.n_actions,
                                             self.bike_station.n_features,
                                             0.01, 0.9)
            else:
                print("Error: pick correct brain")
                break

            # Train the RL agent and collect performance stats
            rewards, final_stocks = self.train_operator(
                idx,
                len(self.episodes),
                eps,
                logging=self.logging,
                brain=self.brain,
                model_based=self.model_based)

            # Log the results from this training session
            self.rewards.append(rewards)
            self.avg_rewards.append(np.mean(rewards))
            self.final_stocks.append(final_stocks)
            #self.q_tables.append(self.operator.get_q_table())
            self.session_action_history.append(self.episode_action_history)
            self.session_stock_history.append(self.episode_stock_history)
            self.reset_episode_history()

            # Destroy the environment and agent objects
            self.bike_station = None
            self.operator = None

            idx += 1

        if logging:
            if self.brain == 'q':
                self.save_session_results(self.get_timestamp(replace=True))
            else:
                self.save_session_results_dqn(self.get_timestamp(replace=True))

        return

    def train_operator(self, idx, num_sessions, episodes, logging, brain,
                       model_based):
        '''
        This function trains an RL agent by interacting with the bike station
        environment. It also tracks and reports performance stats.
        Input:
            - episodes: an int, the number of episodes to train in this session (e.g. 500)
        Output:
            - reward_list: a list of the reward per episode in this session
            - final_stocks: a list of the final stock per episode in this session
        '''

        print("Start training the Agent ...")
        rewards = 0
        reward_list = []
        final_stocks = []
        step = 0

        for eps in range(episodes):

            self.bike_station.reset()

            while True:

                # The agent picks an action (number of bikes to move)
                # and sends it to the bike station environment.
                # The agent gets feedback from the environment (e.g. the reward for the action, the new bike stock, etc.)
                # and learns from it by updating its Q-table (state, action, reward).
                # Repeat until the end of the day (23 hours),
                # then reset the bike station environment to start a new day and repeat.

                if self.brain == 'q':
                    action = self.operator.choose_action(
                        self.bike_station.get_old_stock(),
                        self.bike_station.get_expected_stock())
                    current_hour, old_stock, new_stock, expected_stock, _, reward, done, game_over = self.bike_station.ping(
                        action)

                else:
                    action = self.operator.choose_action(
                        self.bike_station.get_old_stock())
                    current_hour, old_stock, new_stock, reward, done = self.bike_station.ping_dqn(
                        action)
                    self.operator.store_transition(old_stock, action, reward,
                                                   new_stock)
                    if step > 50 and (step % 10 == 0):
                        self.operator.learn()

                #observation_, reward, done = self.bike_station.ping(action)
                if done:

                    print(
                        "{} of {} Session | Episode: {} | Final Stock: {} | Final Reward: {:.2f}"
                        .format(idx, num_sessions, eps, old_stock, rewards))

                    reward_list.append(rewards)
                    final_stocks.append(old_stock)
                    rewards = 0

                    # Log session action history by episode
                    if brain == 'q':
                        self.episode_action_history.append(
                            self.operator.get_hourly_actions())
                        self.episode_stock_history.append(
                            self.operator.get_hourly_stocks())
                        self.operator.reset_hourly_history()
                    else:
                        self.episode_stock_history.append(
                            self.operator.get_hourly_stocks())
                        self.operator.reset_hourly_history()

                    break

                if brain == 'q':

                    self.operator.learn(old_stock, action, reward, new_stock,
                                        expected_stock, game_over)

                step += 1
                rewards += reward

                # Log hourly action history by each episode

            with open('dqn_log.txt', 'a') as f:
                f.write(
                    "{} of {} Session | Episode: {} | Final Stock: {} |Final Reward: {:.2f} \n"
                    .format(idx, num_sessions, eps, old_stock, rewards))

        return reward_list, final_stocks

    def get_timestamp(self, replace):

        if replace:

            return str(datetime.datetime.now()).replace(" ", "").replace(":", "").\
                        replace(".", "").replace("-", "")

        else:

            return str(datetime.datetime.now())

    def reset_episode_history(self):

        self.episode_action_history = []
        self.episode_stock_history = []

    def cal_performance(self):

        successful_stocking = []

        print("===== Performance =====")

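        # an episode counts as successful when its final stock ends in (0, 50]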
        for session in range(len(self.final_stocks)):
            length = len(self.final_stocks[session])
            num_overstock = np.count_nonzero(
                np.array(self.final_stocks[session]) > 50)
            num_understock = np.count_nonzero(
                np.array(self.final_stocks[session]) <= 0)
            ratio = (length - num_understock - num_overstock) * 100 / length

            print(
                "Session {} | Overstock {} Times | Understock {} Times | {}% Successful"
                .format(session, num_overstock, num_understock, ratio))

            average_reward = round(self.avg_rewards[session], 2)
            print("Average Episode Reward for Session: {}".format(
                average_reward))

            successful_stocking.append(ratio)

        return successful_stocking

    def save_session_results(self, timestamp):
        '''
        This function logs the following: 
            - overall success ratio of each session
            - line chart of success ratio by session
            - line chart of reward history by session
            - Q Table of each session
            - Comparison Line Chart of First and Last Episode Hourly Actions
        '''

        # --- create a session folder ---
        dir_path = "./performance_log/" + timestamp

        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        successful_stocking = self.cal_performance()

        # --- Write Success Rate to File ---
        fname = dir_path + "/success_rate - " + timestamp + ".txt"

        with open(fname, 'w') as f:

            f.write("Logged at {}".format(self.get_timestamp(replace=False)))
            f.write("\n")
            f.write("This training session ran episodes: {}".format(
                self.episodes))
            f.write("\n")

            for session in range(len(successful_stocking)):
                f.write(
                    "Session {} | Episodes: {} | Success Rate: {:.2f}%".format(
                        session, self.episodes[session],
                        successful_stocking[session]))
                f.write("\n")

        # --- Plot Overall Success Rate by Episode ---

        title = "% of Successful Rebalancing - " + timestamp

        fig1 = plt.figure()
        plt.plot(self.episodes, successful_stocking)
        plt.xlabel("Episodes")
        plt.ylabel("% Success Rate")
        plt.title(title)
        fig1.savefig(dir_path + "/session_success_rate_" + timestamp)

        # --- Plot Reward History by Training Session ---

        for session in range(len(self.rewards)):

            fig = plt.figure(figsize=(10, 8))

            title = "Reward History by Training Session " + str(
                session) + " - " + timestamp

            x_axis = [x for x in range(self.episodes[session])]
            plt.plot(x_axis,
                     self.rewards[session],
                     label="Session " + str(session))
            plt.legend()
            plt.xlabel("Episode")
            plt.ylabel("Reward")
            plt.title(title)
            fig.savefig(dir_path + "/reward_history_session_" + \
                        str(session) + timestamp)

        # --- Plot Average Reward History by Training Session ---
        figR = plt.figure(figsize=[10, 8])
        lengths = [len(r) for r in self.rewards]
        means = [np.mean(r) for r in self.rewards]
        if len(self.rewards) > 1:
            increment = (lengths[1] - lengths[0]) / 20
        else:
            increment = lengths[0] / 20

        for reward_list in self.rewards:
            Q3 = np.percentile(reward_list, 75)
            Q1 = np.percentile(reward_list, 25)
            M = np.mean(reward_list)
            location = len(reward_list)
            plt.plot([location - increment, location + increment], [Q1, Q1],
                     'k-')
            plt.plot([location - increment, location + increment], [Q3, Q3],
                     'k-')
            plt.plot([location, location], [Q1, Q3], 'k-')
            plt.scatter(location, M, s=100, color='dodgerblue')

        plt.xlabel('Number of Episodes in Session')
        plt.ylabel('Average Reward per Episode')
        plt.title('Average Reward vs. Session Size', size=20)
        plt.xticks(lengths)

        plt.plot(lengths, means, linestyle='--')

        figR.savefig(dir_path + "/reward_averages")

        # --- Save Q tables ---

        for session in range(len(self.q_tables)):

            self.q_tables[session].to_csv(dir_path + "/q_table_session_" + \
                        str(session) + timestamp + ".csv")

        # --- Comparison Line Chart of First and Last Episode for each Session ---

        file_path = dir_path + "/action_history"

        if not os.path.exists(file_path):
            os.makedirs(file_path)

        for session in range(len(self.session_action_history)):

            first_eps_idx = 0
            last_eps_idx = len(self.session_action_history[session]) - 1

            fig = plt.figure(figsize=(10, 8))
            title = "Session " + str(
                session) + " - Hourly Action of Eps " + str(
                    first_eps_idx) + " and Eps " + str(last_eps_idx)

            x_axis = [
                x for x in range(len(self.session_action_history[session][0]))
            ]
            plt.plot(x_axis,
                     self.session_action_history[session][0],
                     label="Eps 0")
            plt.plot(x_axis,
                     self.session_action_history[session][-1],
                     label="Eps " + str(last_eps_idx))

            plt.legend()
            plt.xlabel("Hours")
            plt.ylabel("Number of Bikes Moved")
            plt.title(title)

            fig.savefig(file_path + "/action_history_" + str(session) +
                        timestamp)

        # --- Comparison Line Chart of Simulated and Rebalanced Bike Stock --- #
        file_path = dir_path + "/stock_history"

        if not os.path.exists(file_path):
            os.makedirs(file_path)

        for session in range(len(self.session_stock_history)):

            first_eps_idx = 0
            last_eps_idx = len(self.session_stock_history[session]) - 1

            fig = plt.figure(figsize=(10, 8))
            title = "[" + self.method + "]" + "Session " + str(
                session) + " - Original vs. Balanced Bike Stock after " + str(
                    first_eps_idx) + " and Eps " + str(last_eps_idx)

            x_axis = [
                x for x in range(len(self.session_stock_history[session][0]))
            ]
            plt.plot(x_axis,
                     self.sim_stock[session],
                     label="Original without Balancing")
            plt.plot(x_axis,
                     self.session_stock_history[session][0],
                     label="Balanced Bike Stock - Eps 0")
            plt.plot(x_axis,
                     self.session_stock_history[session][-1],
                     label="Balanced Bike Stock - Eps " + str(last_eps_idx))

            plt.axhline(y=50, c="r", ls="--", label="Upper Stock Limit")
            plt.axhline(y=0, c="r", ls="--", label="Lower Stock Limit")

            plt.legend()
            plt.xlabel("Hours")
            plt.ylabel("Number of Bike Stock")
            plt.title(title)

            fig.savefig(file_path + "/stock_history_" + str(session) +
                        timestamp)

        return

    def save_session_results_dqn(self, timestamp):
        dir_path = "./performance_log/" + timestamp

        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        # --- Comparison Line Chart of Simulated and Rebalanced Bike Stock --- #
        file_path = dir_path + "/stock_history"

        if not os.path.exists(file_path):
            os.makedirs(file_path)

        successful_stocking = self.cal_performance()

        # --- Write Success Rate to File ---
        fname = dir_path + "/success_rate - " + timestamp + ".txt"

        with open(fname, 'w') as f:

            f.write("Logged at {}".format(self.get_timestamp(replace=False)))
            f.write("\n")
            f.write("This training session ran episodes: {}".format(
                self.episodes))
            f.write("\n")

            for session in range(len(successful_stocking)):
                f.write(
                    "Session {} | Episodes: {} | Success Rate: {:.2f}%".format(
                        session, self.episodes[session],
                        successful_stocking[session]))
                f.write("\n")

        # --- Plot Overall Success Rate by Episode ---

        title = "% of Successful Rebalancing - " + timestamp

        fig1 = plt.figure()
        plt.plot(self.episodes, successful_stocking)
        plt.xlabel("Episodes")
        plt.ylabel("% Success Rate")
        plt.title(title)
        fig1.savefig(dir_path + "/session_success_rate_" + timestamp)

        for session in range(len(self.session_stock_history)):

            first_eps_idx = 0
            last_eps_idx = len(self.session_stock_history[session]) - 1

            fig = plt.figure(figsize=(10, 8))
            title = "[" + self.method + "]" + " Session " + str(
                session) + " - Original vs. Balanced Bike Stock after " + str(
                    first_eps_idx) + " and Eps " + str(last_eps_idx)

            x_axis = [
                x for x in range(len(self.session_stock_history[session][0]))
            ]
            plt.plot(x_axis,
                     self.sim_stock[session],
                     label="Original without Balancing")
            plt.plot(x_axis,
                     self.session_stock_history[session][0],
                     label="Balanced Bike Stock - Eps 0")
            plt.plot(x_axis,
                     self.session_stock_history[session][-1],
                     label="Balanced Bike Stock - Eps " + str(last_eps_idx))

            plt.axhline(y=50, c="r", ls="--", label="Upper Stock Limit")
            plt.axhline(y=0, c="r", ls="--", label="Lower Stock Limit")

            plt.legend()
            plt.xlabel("Hours")
            plt.ylabel("Number of Bike Stock")
            plt.title(title)

            fig.savefig(file_path + "/stock_history_" + "DQN" + str(session) +
                        timestamp)

        return
Example #7
    agent.sess, r"saved model/mountain car/dqn/mountain car_dqn.ckpt")

# if you don't want to train, set this
# agent.learn_threshold = 1e8

steps = []

for i_episode in range(20):
    total_steps = 0
    observation = env.reset()
    while True:
        # if i_episode > 10:
        #     env.render()

        # False means act deterministically
        action = agent.choose_action(observation, True)

        observation_, reward, done, info = env.step(action)

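        # override the environment reward at termination with a fixed positive bonus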
        if done:
            reward = 10

        agent.store_transition(observation, action, reward, observation_, done)

        if done:
            print('episode', i_episode, total_steps)
            steps.append(total_steps)
            break

        observation = observation_
        total_steps += 1
Example #8
class SmartAgent(object):
    def __init__(self):
        # from the original base.agent
        self.reward = 0
        self.episodes = 0
        self.steps = 0
        self.obs_spec = None
        self.action_spec = None

        self.dqn = DeepQNetwork(
            len(smart_actions),
            10,  # number of state features; one of the most important values and it must be updated manually
            learning_rate=0.001,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=200,
            memory_size=5000,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=True)

        # self defined vars
        self.fighting = False
        self.player_hp = []
        self.enemy_hp = []
        self.previous_enemy_hp = []
        self.previous_player_hp = []
        self.leftover_enemy_hp = []
        self.win = 0
        self.count = 0

        self.previous_action = None
        self.previous_state = None

    def step(self, obs):
        # from the original base.agent
        self.steps += 1
        self.reward += obs.reward

        current_state, enemy_hp, player_hp, enemy_loc, player_loc, distance, selected, enemy_count, player_count, player_cooldown = self.extract_features(
            obs)

        self.player_hp.append(sum(player_hp))
        self.enemy_hp.append(sum(enemy_hp))

        # script the first few actions to improve learning performance
        while not self.fighting:
            for i in range(0, player_count):
                if distance[i] < 20:
                    self.fighting = True
                    # return actions.FunctionCall(_NO_OP, [])

            return actions.FunctionCall(_ATTACK_SCREEN,
                                        [_NOT_QUEUED, enemy_loc[0]])

        # Default case => Select unit
        # select the unit that is closest to the enemy
        # if same distance, pick the one with lower hp
        # if same distance and hp, randomly select one
        closest_indices = []
        closest_index = distance.index(min(distance))

        for i in range(0, player_count):
            if distance[i] == distance[closest_index]:
                closest_indices.append(i)

        lowest_hp_indices = []
        lowest_hp_index = player_hp.index(min(player_hp))

        for i in range(0, player_count):
            if player_hp[i] == player_hp[lowest_hp_index]:
                lowest_hp_indices.append(i)

        common_indices = list(
            set(closest_indices).intersection(lowest_hp_indices))

        if len(common_indices) != 0:
            selected_index = random.choice(common_indices)
        elif len(closest_indices) != 0:
            selected_index = random.choice(closest_indices)
        else:
            selected_index = 0

        if selected[selected_index] == 0 or (selected[0] == 1
                                             and selected[1] == 1):
            return actions.FunctionCall(
                _SELECT_POINT, [_NOT_QUEUED, player_loc[selected_index]])

        rl_action = self.dqn.choose_action(np.array(current_state))
        smart_action = smart_actions[rl_action]

        # record the transition in memory so the DQN can learn from it
        if self.previous_action is not None:
            reward = self.get_reward(obs, distance, player_hp, enemy_hp,
                                     player_count, enemy_count, rl_action,
                                     selected, player_loc, enemy_loc,
                                     player_cooldown)

            self.dqn.store_transition(np.array(self.previous_state),
                                      self.previous_action, reward,
                                      np.array(current_state))

        self.previous_state = current_state
        self.previous_action = rl_action
        self.previous_enemy_hp = enemy_hp
        self.previous_player_hp = player_hp

        next_action = self.perform_action(obs, smart_action, player_loc,
                                          enemy_loc, selected, player_count,
                                          enemy_count, distance, player_hp)

        return next_action

    def get_reward(self, obs, distance, player_hp, enemy_hp, player_count,
                   enemy_count, rl_action, selected, unit_locs, enemy_loc,
                   player_cooldown):
        reward = 0.
        selected_index = -1

        for i in range(0, DEFAULT_PLAYER_COUNT):
            if selected[i] == 1:
                selected_index = i

        x = unit_locs[selected_index][0]
        y = unit_locs[selected_index][1]

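        # penalize being too close (< 6) or too far (> 20) from the nearest enemy;
        # otherwise the reward grows with the distance kept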
        if distance[selected_index] < 6 or distance[selected_index] > 20:
            reward -= 1
        else:
            reward = distance[selected_index] / 20

        return reward

    # extract all the desired features as inputs for the DQN
    def extract_features(self, obs):
        var = obs.observation['feature_units']

        # get units' location and distance
        enemy, player = [], []

        # get health
        enemy_hp, player_hp, player_cooldown = [], [], []

        # record the selected army
        is_selected = []

        # unit_count
        enemy_unit_count, player_unit_count = 0, 0

        for i in range(0, var.shape[0]):
            if var[i][_UNIT_ALLIANCE] == _PLAYER_HOSTILE:
                enemy.append((var[i][_UNIT_X], var[i][_UNIT_Y]))
                enemy_hp.append(var[i][_UNIT_HEALTH] + var[i][_UNIT_SHIELD])
                enemy_unit_count += 1
            else:
                player.append((var[i][_UNIT_X], var[i][_UNIT_Y]))
                player_hp.append(var[i][_UNIT_HEALTH])
                is_selected.append(var[i][_UNIT_IS_SELECTED])
                player_cooldown.append((var[i][_UNIT_COOLDOWN]))
                player_unit_count += 1

        # pad if necessary so the current state keeps a fixed length
        for i in range(player_unit_count, DEFAULT_PLAYER_COUNT):
            player.append((-1, -1))
            player_hp.append(0)
            player_cooldown.append(0)
            is_selected.append(-1)

        for i in range(enemy_unit_count, DEFAULT_ENEMY_COUNT):
            enemy.append((-1, -1))
            enemy_hp.append(0)

        # get distance
        min_distance = [100000 for x in range(DEFAULT_PLAYER_COUNT)]

        for i in range(0, player_unit_count):
            for j in range(0, enemy_unit_count):
                distance = int(
                    math.sqrt((player[i][0] - enemy[j][0])**2 +
                              (player[i][1] - enemy[j][1])**2))

                if distance < min_distance[i]:
                    min_distance[i] = distance

        # some new stuff to try
        player_units, enemy_units = [], []

        for i in range(0, var.shape[0]):
            if var[i][_UNIT_ALLIANCE] == _PLAYER_HOSTILE:
                unit = []
                unit.append(var[i][_UNIT_X])
                unit.append(var[i][_UNIT_Y])
                unit.append(var[i][_UNIT_HEALTH] + var[i][_UNIT_SHIELD])
                unit.append(var[i][_UNIT_COOLDOWN])

                enemy_units.append(unit)
            else:
                unit = []
                unit.append(var[i][_UNIT_X])
                unit.append(var[i][_UNIT_Y])
                unit.append(var[i][_UNIT_HEALTH])
                unit.append(var[i][_UNIT_COOLDOWN])
                unit.append(100000)  # default distance
                unit.append(var[i][_UNIT_IS_SELECTED])

                if var[i][_UNIT_IS_SELECTED] == 1:
                    player_units.append(unit)

                    if var[i][_UNIT_HEALTH] < 20:
                        self.count += 1

        # pad if necessary so the current state keeps a fixed length
        for i in range(player_unit_count, 1):
            unit = [-1, -1, 0, 0, 100000, 0]
            player_units.append(unit)

        for i in range(enemy_unit_count, DEFAULT_ENEMY_COUNT):
            unit = [-1, -1, 0, 0]
            enemy_units.append(unit)

        for unit in player_units:
            for opponent in enemy_units:
                distance = int(
                    math.sqrt((unit[0] - opponent[0])**2 +
                              (unit[1] - opponent[1])**2))

                if distance < unit[4]:
                    unit[4] = distance

        # flatten the arrays so that each feature is a 1D array
        feature1 = np.array(enemy_hp).flatten()  # enemy's hp
        feature2 = np.array(player_hp).flatten()  # player's hp
        feature3 = np.array(enemy).flatten()  # enemy's coordinates
        feature4 = np.array(player).flatten()  # player's coordinates
        feature5 = np.array(min_distance).flatten()  # distance
        feature6 = np.array(player_cooldown).flatten()

        feature7 = np.array(player_units).flatten()
        feature8 = np.array(enemy_units).flatten()

        # combine all features horizontally
        #current_state = np.hstack((feature1, feature2, feature3, feature4, feature5, feature6))
        current_state = np.hstack((feature7, feature8))

        return current_state, enemy_hp, player_hp, enemy, player, min_distance, is_selected, enemy_unit_count, player_unit_count, player_cooldown

    # perform the action chosen by the DQN
    def perform_action(self, obs, action, unit_locs, enemy_locs, selected,
                       player_count, enemy_count, distance, player_hp):
        index = -1

        for i in range(0, DEFAULT_PLAYER_COUNT):
            if selected[i] == 1:
                index = i

        x = unit_locs[index][0]
        y = unit_locs[index][1]

        if action == ATTACK_TARGET:
            if _ATTACK_SCREEN in obs.observation["available_actions"]:
                if enemy_count >= 1:
                    return actions.FunctionCall(
                        _ATTACK_SCREEN,
                        [_NOT_QUEUED, enemy_locs[0]])  # x,y => col,row

        elif action == MOVE_UP:
            if _MOVE_SCREEN in obs.observation[
                    "available_actions"] and index != -1:
                x = x
                y = y - 4

                if 3 > x:
                    x = 3
                elif x > 79:
                    x = 79

                if 3 > y:
                    y = 3
                elif y > 59:
                    y = 59

                return actions.FunctionCall(
                    _MOVE_SCREEN, [_NOT_QUEUED, [x, y]])  # x,y => col,row

        elif action == MOVE_DOWN:
            if _MOVE_SCREEN in obs.observation[
                    "available_actions"] and index != -1:
                x = x
                y = y + 4

                if 3 > x:
                    x = 3
                elif x > 79:
                    x = 79

                if 3 > y:
                    y = 3
                elif y > 59:
                    y = 59

                return actions.FunctionCall(_MOVE_SCREEN,
                                            [_NOT_QUEUED, [x, y]])

        elif action == MOVE_LEFT:
            if _MOVE_SCREEN in obs.observation[
                    "available_actions"] and index != -1:
                x = x - 4
                y = y

                if 3 > x:
                    x = 3
                elif x > 79:
                    x = 79

                if 3 > y:
                    y = 3
                elif y > 59:
                    y = 59

                return actions.FunctionCall(_MOVE_SCREEN,
                                            [_NOT_QUEUED, [x, y]])

        elif action == MOVE_RIGHT:
            if _MOVE_SCREEN in obs.observation[
                    "available_actions"] and index != -1:
                x = x + 4
                y = y

                if 3 > x:
                    x = 3
                elif x > 79:
                    x = 79

                if 3 > y:
                    y = 3
                elif y > 59:
                    y = 59

                return actions.FunctionCall(_MOVE_SCREEN,
                                            [_NOT_QUEUED, [x, y]])

        return actions.FunctionCall(_MOVE_SCREEN, [_NOT_QUEUED, [x, y]])

    def plot_hp(self, path, save):
        plt.plot(np.arange(len(self.player_hp)), self.player_hp)
        plt.ylabel('player hp')
        plt.xlabel('training steps')
        if save:
            plt.savefig(path + '/player_hp.png')
        plt.close()

        plt.plot(np.arange(len(self.enemy_hp)), self.enemy_hp)
        plt.ylabel('enemy hp')
        plt.xlabel('training steps')
        if save:
            plt.savefig(path + '/enemy_hp.png')
        plt.close()

        plt.plot(np.arange(len(self.leftover_enemy_hp)),
                 self.leftover_enemy_hp)
        plt.ylabel('enemy hp')
        plt.xlabel('Episodes')
        if save:
            plt.savefig(path + '/eval.png')
        plt.close()

        print("AVG ENEMY HP LEFT",
              sum(self.leftover_enemy_hp) / len(self.leftover_enemy_hp))
        print("Winning Rate: {0:.2f}%".format(
            float(self.win / (self.episodes - 1) * 100)))
        print("Low hp controlled steps", self.count)

    # from the original base.agent
    def setup(self, obs_spec, action_spec):
        self.obs_spec = obs_spec
        self.action_spec = action_spec

    # from the original base.agent
    def reset(self):
        self.episodes += 1
        # added instead of original
        self.fighting = False
        if self.episodes > 1:
            self.leftover_enemy_hp.append(sum(self.previous_enemy_hp))
            if sum(self.previous_enemy_hp) == 0:
                self.win += 1
            self.dqn.learn()
Example #9
class SmartAgent(base_agent.BaseAgent):
    def __init__(self):
        self.dqn = DeepQNetwork(n_actions=524, n_features=13)

        self.previous_action = None
        self.previous_state = None

        self.episodes = 0
        self.steps = 0
        self.reward = 0

        self.reward_weights = np.array([
            .2,  ##blizz_score
            .2,
            .2,  ##total_unit_value, total_structure_value
            .2,
            .3,  ##killed_unit_value, killed_building_value
            .2,
            .2,  ##mineral_rate, mineral_spent
            .2,
            .1,  ##supply_used, supply_limit
            .3,
            .3,  ##army_supply,worker_supply
            .3  #army_count
        ])

    def transformLocation(self, x, x_distance, y,
                          y_distance):  ## Revisit how this is evaluated
        if not self.base_top_left:
            return [x - x_distance, y - y_distance]

        return [x + x_distance, y + y_distance]

    def step(self, obs):
        super(SmartAgent, self).step(obs)

        blizz_score = obs.observation['score_cumulative'][0]
        total_unit_value = obs.observation['score_cumulative'][3]
        total_structure_value = obs.observation['score_cumulative'][4]
        killed_unit_value = obs.observation['score_cumulative'][5]
        killed_building_value = obs.observation['score_cumulative'][6]
        mineral_rate = obs.observation['score_cumulative'][9]
        mineral_spent = obs.observation['score_cumulative'][11]

        mineral_count = obs.observation['player'][1]  ##7th
        supply_used = obs.observation['player'][3]
        supply_limit = obs.observation['player'][4]
        army_supply = obs.observation['player'][5]
        worker_supply = obs.observation['player'][6]
        army_count = obs.observation['player'][8]

        ## This should also take feature layers
        current_state = np.array([
            blizz_score, total_unit_value, total_structure_value,
            killed_unit_value, killed_building_value, mineral_rate,
            mineral_spent, mineral_count, supply_used, supply_limit,
            army_supply, worker_supply, army_count
        ])  ## New state? 0 or 1 based on position?

        ## Choose action
        rl_action = self.dqn.choose_action(
            current_state, list(obs.observation['available_actions']))

        reward = 0
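        ## reward: weighted count of state components (excluding mineral_count at index 7)
        ## that increased since the previous step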
        if self.steps > 1:
            reward = np.delete(current_state, 7) - np.delete(
                self.previous_state, 7)
            reward = (reward > 0).astype(int)
            reward = np.sum(np.dot(reward, self.reward_weights))
            #print reward

            ## Store transition
            self.dqn.store_transition(self.previous_state,
                                      self.previous_action, reward,
                                      current_state)

            ## Learn
            self.dqn.learn()

        self.previous_state = current_state
        self.previous_action = rl_action

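        ## fill every argument of the chosen action function with random values
        ## drawn from its valid size range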
        args = [[np.random.randint(0, size) for size in arg.sizes]
                for arg in self.action_spec.functions[rl_action].args]

        return actions.FunctionCall(rl_action, args)
Example #10
        if i < show_ti:
            print("end one episode")
        if r_sum > 0:
            win_time += 1
    print("test done~~~")
    print(win_time)


for episode in range(EP_MAX):
    # initial observation
    s = transform(board.random_start())
    r_sum = 0.0
    while True:

        # RL choose action based on observation
        action = dqn.choose_action(s, option(s), GLOBAL_N)
        # print(action)
        #if good() or bad():
        #	print(s[0:GLOBAL_M], action/GLOBAL_N, action%GLOBAL_N)
        # RL take action and get next observation and reward
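        # the flat action index encodes (row, column) as row * GLOBAL_N + column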
        if MODE == "random":
            reward, s_, done = board.move(action / GLOBAL_N, action % GLOBAL_N)
        else:
            board.decide(action / GLOBAL_N, action % GLOBAL_N)
            action_space = dqn.rival(transform(board.get_board()))
            reward, s_, done = board.rival(action_space)
        r_sum += reward
        step += 1
        s_ = transform(s_)
        dqn.store_transition(s, action, reward, s_)