Example #1
def run_model(game_count=1):
    """
    run model for game_count games
    """

    # Make environment
    env = WhaleEnv(
        config={
            'active_player': 0,
            'seed': datetime.utcnow().microsecond,
            'env_num': 1,
            'num_players': 5
        })
    # Set up agents
    action_num = 3
    agent = SimpleAgent(action_num=action_num, player_num=5)
    agent_0 = RandomAgent(action_num=action_num)
    agent_1 = RandomAgent(action_num=action_num)
    agent_2 = RandomAgent(action_num=action_num)
    agent_3 = RandomAgent(action_num=action_num)
    agents = [agent, agent_0, agent_1, agent_2, agent_3]
    env.set_agents(agents)
    agent.load_pretrained()
    for game in range(game_count):

        # Generate data from the environment
        trajectories = env.run(is_training=False)

        # Print out the trajectories
        print('\nEpisode {}'.format(game))
        for i, trajectory in enumerate(trajectories):
            print('\tPlayer {}'.format(i))
            for t in trajectory:
                print(t)
Example #2
def experiment():
    np.random.seed(3)
    # MDP
    mdp = generate_simple_chain(state_n=5,
                                goal_states=[2],
                                prob=.8,
                                rew=1,
                                gamma=.9)

    action_space = mdp.info.action_space
    observation_space = mdp.info.observation_space
    gamma = mdp.info.gamma

    # Model Block
    model_block = MBlock(env=mdp, render=False)

    # Policy
    epsilon = Parameter(value=1)
    pi = EpsGreedy(epsilon=epsilon)
    table = Table(mdp.info.size)
    pi.set_q(table)

    # Agents
    mdp_info_agent1 = MDPInfo(observation_space=observation_space,
                              action_space=spaces.Discrete(5),
                              gamma=1,
                              horizon=20)
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Discrete(5),
                              action_space=action_space,
                              gamma=gamma,
                              horizon=10)
    agent1 = SimpleAgent(name='HIGH', mdp_info=mdp_info_agent1, policy=pi)
    agent2 = SimpleAgent(name='LOW', mdp_info=mdp_info_agent2, policy=pi)

    # Control Blocks
    control_block1 = ControlBlock(wake_time=10,
                                  agent=agent1,
                                  n_eps_per_fit=None,
                                  n_steps_per_fit=1)
    control_block2 = ControlBlock(wake_time=1,
                                  agent=agent2,
                                  n_eps_per_fit=None,
                                  n_steps_per_fit=1)

    # Algorithm
    blocks = [model_block, control_block1, control_block2]
    order = [0, 1, 2]
    model_block.add_input(control_block2)
    control_block1.add_input(model_block)
    control_block1.add_reward(model_block)
    control_block2.add_input(control_block1)
    control_block2.add_reward(model_block)
    computational_graph = ComputationalGraph(blocks=blocks, order=order)
    core = HierarchicalCore(computational_graph)

    # Train
    core.learn(n_steps=40, quiet=True)
    return
Example #3
def a_vs_b(ship_a, ship_b, trials, attack_range):
    """This function calculates the average time to destruction when a shoots at b.

    Args:
      ship_a (Ship): Attacking ship.
      ship_b (Ship): Defending ship.
      trials (int): Number of trials in the average calculation.
      attack_range (str): Attack range ("long", "medium", or "short").
    
    """
    roll_counts = []
    agent = SimpleAgent()
    for trial in range(trials):
        # Reset ship b for each trial
        ship_b.reset()
        world_state = WorldState()
        world_state.addShip(ship_a, 0)
        world_state.addShip(ship_b, 1)
        num_rolls = 0
        while ship_b.damage_cards() < ship_b.hull():
            num_rolls += 1
            # Handle the attack and receive the updated world state
            world_state = handleAttack(world_state=world_state,
                                       attacker=(ship_a, "front"),
                                       defender=(ship_b, "front"),
                                       attack_range=attack_range,
                                       offensive_agent=agent,
                                       defensive_agent=agent)
        roll_counts.append(num_rolls)
    np_counts = numpy.array(roll_counts)
    return np_counts.mean()
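
# Illustrative usage sketch (not from the original project): the ship names,
# templates, and trial count below are hypothetical, and it assumes the
# ship.Ship and ship_templates interfaces shown in the joust example later
# in this document.
attacker = ship.Ship(name="Attacker", template=ship_templates["Attacker"],
                     upgrades=[], player_number=1)
defender = ship.Ship(name="Defender", template=ship_templates["Defender"],
                     upgrades=[], player_number=2)
mean_attacks = a_vs_b(attacker, defender, trials=250, attack_range="short")
print("Average attacks to destroy the defender: {:.2f}".format(mean_attacks))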
Example #4
def test3():
    env = ConnectFourEnv(display=True)
    simpleAgent = SimpleAgent(env, 2, 1)

    state, gameOver, winner = env.act(1, 0)
    state, gameOver, winner = env.act(2, 6)
    time.sleep(0.5)

    state, gameOver, winner = env.act(1, 1)
    state, gameOver, winner = env.act(2, 5)
    time.sleep(0.5)

    state, gameOver, winner = env.act(1, 3)
    action = simpleAgent.getAction(env.state)
    state, gameOver, winner = env.act(2, action)
    time.sleep(5)
Example #5
    def __init__(self):

        # Only four navigation actions for subproblem actions
        nA = 4
        # Only taxi row, column and destination index for subproblem states
        states_shape = (5, 5, 4)
        self.sub_agent = SimpleAgent(states_shape=states_shape, nA=nA)

        # Learning rate / step size
        self.sub_agent.alpha = 0.01
        self.sub_agent.alpha_decay = 1
        self.sub_agent.alpha_min = 0

        # Discount
        self.sub_agent.gamma = 1
        self.sub_agent.gamma_decay = 1
        self.sub_agent.gamma_min = 0

        # Exploration
        self.sub_agent.epsilon = 0.01
        self.sub_agent.epsilon_decay = 1
        self.sub_agent.epsilon_min = 0

        # For our params, just mimic the sub-agent's
        (self.alpha, self.epsilon, self.gamma) = \
            self.sub_agent.alpha, self.sub_agent.epsilon, self.sub_agent.gamma

        # Environment priors
        self.action_pickup = 4
        self.action_dropoff = 5
        self.locs = [(0, 0), (0, 4), (4, 0), (4, 3)]
        self.passenger_in_taxi_idx = 4

        print("alpha: {0}, alpha_decay: {1}, alpha_min: {2}".format(
            self.sub_agent.alpha, self.sub_agent.alpha_decay, self.sub_agent.alpha_min))
        print("gamma: {0}, gamma_decay: {1}, gamma_min: {2}".format(
            self.sub_agent.gamma, self.sub_agent.gamma_decay, self.sub_agent.gamma_min))
        print("epsilon: {0}, epsilon_decay: {1}, epsilon_min: {2}".format(
            self.sub_agent.epsilon, self.sub_agent.epsilon_decay, self.sub_agent.epsilon_min))
Example #6
    def __init__(self, settings):
        self.settings = settings
        self.totalGameNo = settings['total_game_no']
        self.playedGameNo = 0
        self.simStepNo = settings['sim_step_no']
        self.saveStepNo = settings['save_step_no']
        self.display = settings['display']
        self.env = ConnectFourEnv(self.display)
        self.visited = {}           # (stateStr, turn, action), visited
        self.won = {}              # (stateStr, turn, action), won
        self.DRAW = -1
        self.PLAYER = 1
        self.OPP = 2
        self.simpleAgent = SimpleAgent(self.env, self.OPP, self.PLAYER)
        self.winnerResult = {self.DRAW:0, self.PLAYER:0, self.OPP:0}
        self.greedyEpsilon = 0.1

        self.startTime = time.strftime('%Y%m%d_%H%M%S')
        logFile="output/%s.log" % (self.startTime)            
        util.Logger(logFile)

        self.testMode = False
        self.debugger = DebugInput(self).start()
Example #7
def make_env(n_substeps=5, horizon=250, deterministic_mode=False):
    '''
        This make_env function is not used anywhere; it exists to provide a simple, bare-bones
            example of how to construct a multi-agent environment using the modules framework.
    '''
    env = Base(n_agents=1,
               n_substeps=n_substeps,
               horizon=horizon,
               floor_size=10,
               grid_size=50,
               deterministic_mode=deterministic_mode,
               env_no=0,
               action_lims=(-250.0, 250.0))

    # Add Walls
    #env.add_module(RandomWalls(grid_size=5, num_rooms=2, min_room_size=5, door_size=5, low_outside_walls=True, outside_wall_rgba="1 1 1 0.1"))

    # Add Agents
    first_agent_placement = custom_placement
    agent_placement_fn = [first_agent_placement]
    env.add_module(SimpleAgent(1, placement_fn=agent_placement_fn))

    env.reset()

    keys_self = ['agent_qpos_qvel']
    keys_mask_self = []  #['mask_aa_obs']
    keys_external = []  #['agent_qpos_qvel']
    keys_mask_external = []
    keys_copy = []

    env = AddConstantObservationsWrapper(
        env, new_obs={'target_pos': np.full((1, 1), 0.0)})
    keys_self += ['target_pos']
    env = SimpleWrapper(env)

    env = SplitMultiAgentActions(env)
    #env = DiscretizeActionWrapper(env, 'action_movement', nbuckets=21)
    env = SplitObservations(env,
                            keys_self + keys_mask_self,
                            keys_copy=keys_copy)
    env = DiscardMujocoExceptionEpisodes(env)

    env = SelectKeysWrapper(env,
                            keys_self=keys_self,
                            keys_external=keys_external,
                            keys_mask=keys_mask_self + keys_mask_external,
                            flatten=False)

    return env
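
# Illustrative rollout sketch (hypothetical, not part of the original module):
# assumes the wrapped environment exposes the usual gym-style reset()/step()
# interface and an action_space that supports sample().
env = make_env(n_substeps=5, horizon=250)
obs = env.reset()
for _ in range(10):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()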
Example #8
def train_model(max_episodes=100):
    """
    Trains a DQN agent to play the Whale game by trial and error.

    :return: None
    """

    # buffer = ReplayBuffer()
    # Make environment
    env = WhaleEnv(
        config={
            'active_player': 0,
            'seed': datetime.utcnow().microsecond,
            'env_num': 1,
            'num_players': 5
        })
    # Set up agents
    action_num = 3
    agent = SimpleAgent(action_num=action_num, player_num=5)
    agent_0 = NoDrawAgent(action_num=action_num)
    agent_1 = NoDrawAgent(action_num=action_num)
    agent_2 = NoDrawAgent(action_num=action_num)
    agent_3 = NoDrawAgent(action_num=action_num)
    # agent_train = RandomAgent(action_num=action_num)
    agents = [agent, agent_0, agent_1, agent_2, agent_3]
    # train_agents = [agent_train, agent_0, agent_1, agent_2, agent_3]
    env.set_agents(agents)
    agent.load_pretrained()
    min_perf, max_perf = 1.0, 0.0
    for episode_cnt in range(1, max_episodes + 1):
        # print(f'{datetime.utcnow()} train ...')
        loss = agent.train(
            collect_gameplay_experiences(env, agents, GAME_COUNT_PER_EPISODE))
        # print(f'{datetime.utcnow()} eval  ...')
        avg_rewards = evaluate_training_result(env, agents,
                                               EVAL_EPISODES_COUNT)
        # print(f'{datetime.utcnow()} calc  ...')
        if avg_rewards[0] > max_perf:
            max_perf = avg_rewards[0]
            agent.save_weight()
        if avg_rewards[0] < min_perf:
            min_perf = avg_rewards[0]
        print('{0:03d}/{1} perf:{2:.2f}(min:{3:.2f} max:{4:.2f})'
              'loss:{5:.4f} rewards:{6:.2f} {7:.2f} {8:.2f} {9:.2f}'.format(
                  episode_cnt, max_episodes, avg_rewards[0], min_perf,
                  max_perf, loss[0], avg_rewards[1], avg_rewards[2],
                  avg_rewards[3], avg_rewards[4]))
    # env.close()
    print('training end')
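
# Minimal entry-point sketch; assumes this function lives in a script that
# also defines GAME_COUNT_PER_EPISODE and EVAL_EPISODES_COUNT. The episode
# count below is a placeholder.
if __name__ == '__main__':
    train_model(max_episodes=200)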
Example #9
def main(_):
    agent = SimpleAgent()
    try:
        while True:
            with sc2_env.SC2Env(
                    map_name="Simple64",
                    players=[
                        sc2_env.Agent(sc2_env.Race.zerg),
                        sc2_env.Bot(sc2_env.Race.random,
                                    sc2_env.Difficulty.very_easy)
                    ],
                    agent_interface_format=features.AgentInterfaceFormat(
                        action_space=actions.ActionSpace.RAW,
                        use_raw_units=True,
                        raw_resolution=64,
                    ),
            ) as env:
                run_loop.run_loop([agent], env)
    except KeyboardInterrupt:
        pass
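
# Typical absl entry point for a pysc2 agent script (assumed; the original
# snippet does not show it and would also need `from absl import app`).
if __name__ == "__main__":
    app.run(main)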
Example #10
def test4():
    env = ConnectFourEnv(display=True)
    simpleAgent1 = SimpleAgent(env, 1, 2)
    simpleAgent2 = SimpleAgent(env, 2, 1)

    state = env.getState()
    while True:
        action1 = simpleAgent1.getAction(state)
        state, gameOver, winner = env.act(1, action1, True)
        time.sleep(0.3)
        if gameOver:
            break
        action2 = simpleAgent2.getAction(state)
        state, gameOver, winner = env.act(2, action2, True)
        time.sleep(0.3)
        if gameOver:
            break

    if winner == -1:
        print 'Game draw'
    else:
        print 'Player %s won' % winner
    time.sleep(5)

Example #11
U_vi = value_iteration(epsilon=0.001)
"""
Collects and writes the results to a file for the Random Agent and 
draws the graph
Draws:
    Mean Reward per Episode vs Episode Number
"""
random_agent = RandomAgent(env_random)
process_data_random(env_random, random_agent, MAX_EPISODES,
                    MAX_ITERS_PER_EPISODE, REWARD_HOLE_SIMPLE, PROBLEM_ID)
"""
Collects and writes the results for the Simple Agent containing
data such as the number of iterations to reach the goal
"""
simple_agent = SimpleAgent(env_simple)
process_data_simple(env_simple, simple_agent, PROBLEM_ID)
"""
Collects and writes the results to a file for the Q-learning Agent 
and draws the graphs.
Draws:
    Mean Reward per Episode vs Episode Number
    Utility Values in each State against Episode Number
"""
states = [i for i in range(64)]
q_learning_agent = QLearningAgent(env_qlearn, NE, RPLUS, GAMMA, ALPHA)
U = process_data_q(env_qlearn, q_learning_agent, MAX_EPISODES,
                   MAX_ITERS_PER_EPISODE, states, PROBLEM_ID, REWARD_HOLE_Q)
compare_utils(U_vi, U, 'Value itr', 'Q learning')
Example #12
            board = self.drop_piece(board, col, piece)
            if self.check_if_winning(board, piece):
                winner = piece
                break
            piece = piece % 2 + 1
        self.agent1.game_over(winner)
        self.agent2.game_over(winner)
        return winner

    def end(self):
        self.agent1.teardown()
        self.agent2.teardown()


# run agents
config = Config(6, 7, 4)
agents = [(RandomAgent(config), "rnd", 5000),
          (SimpleAgent(config), "simple", 5000),
          (OneStepLookaheadAgent(config), "1sla", 5000),
          (OneStepLookaheadAgent(config), "1sla", 5000),
          (OneStepLookaheadAgent(config), "1sla", 5000),
          (NStepsLookaheadAgent(config, 2), "2sla", 3000),
          (NStepsLookaheadAgent(config, 3), "3sla", 5000)]
for agent, agent_name, nruns in agents:
    training = Training(config, agent, CNNAgent(config, Network1(),
                                                agent_name))
    for n in range(nruns):
        winner = training.run()
        print("Agent", agent_name, ", game", n, "- player", winner, "wins")
    training.end()
Example #13
def run_training(
    opponent,
    mcts_opp,
    game_state_file,
    graph_file,
    model_save_file,
    mcts_iters,
    temp,
    tempsteps,
    lr,
    discount,
    memsize,
    num_episodes,
    num_epochs,
    batch_size,
    train_every,
    save_every,
    graph_every,
    averaging_window,
    opt_eps=1e-8,
    ucb_c=1.5,
    boardsize=8,
    inputs=20,
    render=False,
    verbose=False,
):
    env = PommermanEnvironment(
        render=render,
        num_agents=2,
        game_state_file=game_state_file,
    )

    run_settings = RunSettings(
        num_episodes=num_episodes,
        num_epochs=num_epochs,
        batch_size=batch_size,
        train_every=train_every,
        save_every=save_every,
        graph_every=graph_every,
        averaging_window=averaging_window,
        graph_file=graph_file,
        verbose=verbose,
    )

    agent_settings = AgentSettings(
        optimizer=torch.optim.Adam,
        learning_rate=lr,
        opt_eps=opt_eps,
        epsilon_max=0,
        epsilon_min=0,
        epsilon_duration=0,
        verbose=verbose,
    )

    memory = MCTSMemory(buffer_len=memsize, discount=discount)

    if mcts_opp is None:
        mcts_opp = opponent
    if mcts_opp == 'rand':
        opp = pommerman.agents.RandomAgent()
    elif mcts_opp == 'noop':
        opp = PommermanNoopAgent()
    elif mcts_opp == 'simp':
        opp = pommerman.agents.SimpleAgent()
    else:
        raise Exception('Invalid MCTS opponent type', mcts_opp)

    mcts_model = ActorCriticNet(board_size=boardsize, in_channels=inputs)
    agent1 = MCTSAgent(
        mcts_iters=mcts_iters,
        discount=discount,
        c=ucb_c,
        temp=temp,
        tempsteps=tempsteps,
        agent_id=0,
        opponent=opp,
        model_save_file=model_save_file,
        model=mcts_model,
        settings=agent_settings,
        memory=memory,
    )
    agent1.load()

    if opponent == 'rand':
        agent2 = RandomAgent()
    elif opponent == 'noop':
        agent2 = NoopAgent()
    elif opponent == 'simp':
        agent2 = SimpleAgent()
    else:
        raise Exception('Invalid opponent type', opponent)

    experiment = Experiment([agent1, agent2], env, run_settings)
    experiment.train()
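
# Illustrative invocation sketch; every value below is a placeholder and not
# taken from the original project.
run_training(
    opponent='simp',
    mcts_opp=None,                      # fall back to the training opponent
    game_state_file=None,
    graph_file='graphs/mcts_vs_simple.png',
    model_save_file='models/mcts_actor_critic.pt',
    mcts_iters=100,
    temp=1.0,
    tempsteps=20,
    lr=1e-3,
    discount=0.99,
    memsize=50000,
    num_episodes=500,
    num_epochs=4,
    batch_size=64,
    train_every=10,
    save_every=50,
    graph_every=50,
    averaging_window=20,
)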
Example #14
import sys
import platform
from absl import logging
from absl import app
from absl import flags

from pysc2.env import sc2_env
from pysc2.env import run_loop

from pysc2.env import remote_sc2_env

# !!! LOAD YOUR BOT HERE !!!
from simple_agent import SimpleAgent
AGENT = SimpleAgent()
RACE = sc2_env.Race.protoss
STEP_MUL = 8
AGENT_INTERFACE_FORMAT = sc2_env.parse_agent_interface_format(
    feature_screen=84,
    feature_minimap=64,
    rgb_screen=None,
    rgb_minimap=None,
    action_space="FEATURES", #FEATURES or RGB
    use_feature_units=False)

# Flags
FLAGS = flags.FLAGS
flags.DEFINE_integer("GamePort", None, "GamePort")
flags.DEFINE_integer("StartPort", None, "StartPort")
flags.DEFINE_string("LadderServer", "127.0.0.1", "LadderServer")
flags.DEFINE_string("OpponentId", None, "OpponentId")
Example #15
        print("Unrecognized ship name {}".format(args.ship2))
        print("Recognized ship names are:\n")
        for name in ship_templates.keys():
            print("\t{}".format(name))
        exit(1)

for distance in args.ranges:
    if distance not in ["long", "medium", "short"]:
        print("Unknown range for ship combat: {}".format(distance))
        sys.exit(1)

# Set up logging to track what happens during the die rolling.
logging.basicConfig(filename='joust.log', level=logging.DEBUG)

# Agent for the simulation
agent = SimpleAgent()

# Loop through all pairs and have them joust
for ship_name_1 in first_ship_names:
    ship_1 = ship.Ship(name=ship_name_1,
                       template=ship_templates[ship_name_1],
                       upgrades=[],
                       player_number=1)
    for ship_name_2 in second_ship_names:
        for attack_range in ["long", "medium", "short"]:
            # Make sure we are actually rolling dice
            a_colors, a_roll = ship_1.roll("front", attack_range)
            if 0 < len(a_colors):
                roll_counts = []
                print("{} vs {} at range {}".format(ship_name_1, ship_name_2,
                                                    attack_range))
Example #16
class DecompAgent:
    """
    This agent takes advantage of the problem sub-structure by decomposing the
    root problem into a navigation subproblem (which it solves using a simple
    Q-learning agent) and using hand-crafted heuristics for all other decisions.
    """

    def __init__(self):

        # Only four navigation actions for subproblem actions
        nA = 4
        # Only taxi row, column and destination index for subproblem states
        states_shape = (5, 5, 4)
        self.sub_agent = SimpleAgent(states_shape=states_shape, nA=nA)

        # Learning rate / step size
        self.sub_agent.alpha = 0.01
        self.sub_agent.alpha_decay = 1
        self.sub_agent.alpha_min = 0

        # Discount
        self.sub_agent.gamma = 1
        self.sub_agent.gamma_decay = 1
        self.sub_agent.gamma_min = 0

        # Exploration
        self.sub_agent.epsilon = 0.01
        self.sub_agent.epsilon_decay = 1
        self.sub_agent.epsilon_min = 0

        # For our params, just mimic the sub-agent's
        (self.alpha, self.epsilon, self.gamma) = \
            self.sub_agent.alpha, self.sub_agent.epsilon, self.sub_agent.gamma

        # Environment priors
        self.action_pickup = 4
        self.action_dropoff = 5
        self.locs = [(0, 0), (0, 4), (4, 0), (4, 3)]
        self.passenger_in_taxi_idx = 4

        print("alpha: {0}, alpha_decay: {1}, alpha_min: {2}".format(
            self.sub_agent.alpha, self.sub_agent.alpha_decay, self.sub_agent.alpha_min))
        print("gamma: {0}, gamma_decay: {1}, gamma_min: {2}".format(
            self.sub_agent.gamma, self.sub_agent.gamma_decay, self.sub_agent.gamma_min))
        print("epsilon: {0}, epsilon_decay: {1}, epsilon_min: {2}".format(
            self.sub_agent.epsilon, self.sub_agent.epsilon_decay, self.sub_agent.epsilon_min))

    def select_action(self, state):

        # Override epsilon-greedy exploration for pickup/dropoff
        if self.can_pick_up(state):
            return self.action_pickup
        if self.can_drop_off(state):
            return self.action_dropoff

        # Otherwise, defer to the sub-agent
        transformed_state = self.transform_state(state)
        return self.sub_agent.select_action(transformed_state)

    def step(self, state, action, reward, next_state, done):
        # Transform experience into the problem space of the sub-agent

        # If the selected action was pickup/dropoff, then experience is not
        # relevant to sub-problem
        if action == self.action_pickup or action == self.action_dropoff:
            return

        # If we can pickup/dropoff in the next state, then for the
        # sub-problem we consider next_state to be terminal and the episode
        # concluded
        if self.can_pick_up(next_state) or self.can_drop_off(next_state):
            state_t = self.transform_state(state)
            action_t = self.transform_action(action)
            reward_t = 9  # end of episode reward
            next_state_t = self.transform_state(next_state)
            done_t = True

        # Otherwise, transform relatively unchanged for sub-problem
        else:
            state_t = self.transform_state(state)
            action_t = self.transform_action(action)
            reward_t = -1
            next_state_t = self.transform_state(next_state)
            done_t = False

        # Pass transformed experience to sub-agent
        self.sub_agent.step(state_t, action_t, reward_t, next_state_t, done_t)
        (self.alpha, self.epsilon, self.gamma) = \
            self.sub_agent.alpha, self.sub_agent.epsilon, self.sub_agent.gamma

    def transform_state(self, state):
        """Transform state into the problem space of the sub-agent"""

        taxi_row, taxi_col, pass_idx, dest_idx = self.decode_state(state)

        # If we don't have the passenger, passenger is our destination
        if pass_idx != self.passenger_in_taxi_idx:
            dest_idx_t = pass_idx
        # If we have the passenger, destination is our destination
        else:
            dest_idx_t = dest_idx

        # Encode in subproblem state space and return
        return (taxi_row, taxi_col, dest_idx_t)

    def transform_action(self, action):
        # Action space is the same, minus the final two actions
        assert action != self.action_pickup
        assert action != self.action_dropoff
        return action

    def can_pick_up(self, state):
        taxi_row, taxi_col, pass_idx, dest_idx = self.decode_state(state)

        # Can't pickup if passenger already in taxi
        if pass_idx == self.passenger_in_taxi_idx:
            return False

        # Otherwise, taxi must be colocated with passenger
        return (taxi_row, taxi_col) == self.locs[pass_idx]

    def can_drop_off(self, state):
        taxi_row, taxi_col, pass_idx, dest_idx = self.decode_state(state)

        # Can't dropoff if passenger not in taxi
        if pass_idx != self.passenger_in_taxi_idx:
            return False

        # Otherwise, taxi must be colocated with destination
        return (taxi_row, taxi_col) == self.locs[dest_idx]

    def decode_state(self, i):
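        # Assumes the standard Gym Taxi-v3 encoding:
        # state = ((taxi_row * 5 + taxi_col) * 5 + pass_idx) * 4 + dest_idx,
        # which is unpacked here in reverse order.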
        out = []
        out.append(i % 4)
        i = i // 4
        out.append(i % 5)
        i = i // 5
        out.append(i % 5)
        i = i // 5
        out.append(i)
        assert 0 <= i < 5
        taxi_row, taxi_col, pass_idx, dest_idx = reversed(out)
        return taxi_row, taxi_col, pass_idx, dest_idx
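
# Illustrative training-loop sketch (hypothetical): assumes Gym's Taxi-v3
# environment with the classic API where reset() returns the state and
# step() returns (next_state, reward, done, info).
import gym

env = gym.make("Taxi-v3")
agent = DecompAgent()
for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.select_action(state)
        next_state, reward, done, info = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state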
Example #17
class MCTS:
    def __init__(self, settings):
        self.settings = settings
        self.totalGameNo = settings['total_game_no']
        self.playedGameNo = 0
        self.simStepNo = settings['sim_step_no']
        self.saveStepNo = settings['save_step_no']
        self.display = settings['display']
        self.env = ConnectFourEnv(self.display)
        self.visited = {}           # (stateStr, turn, action), visited
        self.won = {}              # (stateStr, turn, action), won
        self.DRAW = -1
        self.PLAYER = 1
        self.OPP = 2
        self.simpleAgent = SimpleAgent(self.env, self.OPP, self.PLAYER)
        self.winnerResult = {self.DRAW:0, self.PLAYER:0, self.OPP:0}
        self.greedyEpsilon = 0.1

        self.startTime = time.strftime('%Y%m%d_%H%M%S')
        logFile="output/%s.log" % (self.startTime)            
        util.Logger(logFile)

        self.testMode = False
        self.debugger = DebugInput(self).start()

    def initializeProcesses(self):
        # Multi process jobs
        self.multiCpuNo = self.settings['multi_cpu_no']
        self.queueList = []
        self.processList = []
        self.queueChild2Parent = Queue()
        for i in range(self.multiCpuNo):        
            queueParent2Child = Queue()
            self.queueList.append(queueParent2Child)
            #print 'creating a child process[%s]' % i
            p = Process(target=self.simulateOne, args=(i, self.simStepNo / self.multiCpuNo, 
                                                       queueParent2Child, self.queueChild2Parent))
            p.start()
            self.processList.append(p)

    def __getstate__(self):
        d = dict(self.__dict__)
        del d['queueList']
        del d['processList']
        del d['queueChild2Parent']
        return d

    def printEnv(self):
        print 'Start time: %s' % self.startTime
        print '[ Running Environment ]'
        for key in self.settings.keys():
            print '{} : '.format(key).ljust(30) + '{}'.format(self.settings[key])
        print 'width: %s, height: %s' % (self.env.width, self.env.height)
    
    def getStateStr(self, state):
        #return np.array_str(state)
        return hash(state.tostring())
    
    def simulate(self, orgState):
        time1 = time.time()
        for i in range(self.multiCpuNo):
            self.queueList[i].put((orgState, self.visited, self.won))
            
        finishedChildNo = 0
        for i in range(self.multiCpuNo):
            childID, winnerList, historyList, expandedList = self.queueChild2Parent.get()
            
            for expandedNode in expandedList:
                if expandedNode not in self.visited:
                    self.visited[expandedNode] = 0
                    self.won[expandedNode] = 0
            
            for winner, history in zip(winnerList, historyList):
                self.updateTreeInfo(winner, history)
            
            finishedChildNo += 1
            
            #print 'simulateOne done %s' % childID
            if finishedChildNo == self.multiCpuNo:
                break
        #print 'all simulateOne finished'
        time2 = time.time()
        
        #print 'simulte took %.2f sec' % (time2 - time1)
        

    def simulateOne(self, id, simStepNo, queueParent2Child, queueChild2Parent):
        while True:
            orgState, visited, won = queueParent2Child.get()
            self.visited = visited
            self.won = won
            self.env.reset()
            self.env.setState(orgState)
            
            historyList = []
            winnerList = []
            expandedList = []
            state = orgState.copy()
            turn = self.PLAYER
            history = []
            expanded = False
    
            for i in range(simStepNo):
                if turn == self.PLAYER:
                    availableActions = self.env.availableActions(state)
                    stateStr = self.getStateStr(state)
                    totalStateVisited = 0
                    # check whether every action has been visited before
                    for action in availableActions:
                        stateActionPair = (stateStr, turn, action)
                        if stateActionPair in self.visited:
                            totalStateVisited += self.visited[stateActionPair]
                        else:
                            totalStateVisited = 0
    
                    if totalStateVisited == 0:
                        action = self.getRandomAction(state)
                    else:
                        maxUpperBound = 0            
                        for action in availableActions:
                            stateActionPair = (stateStr, turn, action)
                            won = self.won.get(stateActionPair, 0)
                            visited = max(self.visited.get(stateActionPair, 1), 1)
                            winRatio = float(won) / visited
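                            # UCB1 selection: exploitation term (win ratio) plus an
                            # exploration bonus of sqrt(2 * ln(total visits) / visits)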
                            upperBound = winRatio + math.sqrt(2 * math.log(totalStateVisited) / visited)
                            if upperBound >= maxUpperBound:
                                maxUpperBound = upperBound
                                selectedAction = action
                        action = selectedAction
                elif turn == self.OPP:
                    if 'sim_opp_policy' in self.settings and self.settings['sim_opp_policy'] == 'simple':
                        action = self.simpleAgent.getAction(state)
                    else:
                        action = self.getRandomAction(state)
                
                stateStr = self.getStateStr(state)
                stateActionPair = (stateStr, turn, action)
                if not expanded and stateActionPair not in self.visited:
                    canExpand = True
                    expanded = True
                else:
                    canExpand = False
                    
                state, gameOver, winner = self.doAction(state, action, turn, history, expandedList, canExpand, False)
                              
                if turn == self.PLAYER:
                    turn = self.OPP
                else:
                    turn = self.PLAYER
    
                if gameOver:
                    self.updateTreeInfo(winner, history)
                    historyList.append(history)
                    winnerList.append(winner)
                    
                    # restart sim
                    self.env.reset()
                    self.env.setState(orgState)
                    state = orgState.copy()                
                    turn = self.PLAYER
                    history = []
                    expanded = False
                    continue
    
            queueChild2Parent.put((id, winnerList, historyList, expandedList))

    def getRandomAction(self, state, availableActions=None):
        if availableActions is None:
            availableActions = self.env.availableActions(state)
        actionIndex = random.randint(0, len(availableActions)-1)
        return availableActions[actionIndex]

    def getAction(self, state, turn):
        availableActions = self.env.availableActions(state)
        
        if len(availableActions) == 1:
            return availableActions[0]
        
        maxAction = -1
        maxWinRatio = 0
        availableActions = self.env.availableActions(state)
        stateStr = self.getStateStr(state)
        for action in availableActions:
            stateActionPair = (stateStr, turn, action)
            if stateActionPair not in self.visited:
                continue
            winRatio = float(self.won.get(stateActionPair, 0)) / max(self.visited.get(stateActionPair, 1), 1)
            if winRatio >= maxWinRatio:
                maxWinRatio = winRatio
                maxAction = action

        return maxAction
        
    def doAction(self, state, action, turn, history, expandedList, canExpand, display):
        newState, gameOver, winner = self.env.act(turn, action, display)
        
        stateStr = self.getStateStr(state)
        stateActionPair = (stateStr, turn, action)
        if stateActionPair not in self.visited and canExpand:
            self.visited[stateActionPair] = 0
            self.won[stateActionPair] = 0
            if expandedList is not None:
                expandedList.append(stateActionPair)
        history.append(stateActionPair)
        return newState, gameOver, winner
        
    def updateTreeInfo(self, winner, history):
        """ Update win result from the current node to the top node """

        for stateActionPair in history:
            if stateActionPair in self.visited:
                self.visited[stateActionPair] += 1
                _, turn, _ = stateActionPair
                if turn == winner:
                    self.won[stateActionPair] += 1
    
    def printHistory(self, history):
        step = 0
        print '\n[ history ]'
        for stateActionPair in history:
            state, turn, action = stateActionPair
            if stateActionPair in self.visited:
                visited = self.visited[stateActionPair]
                won = self.won[stateActionPair]
            else:
                visited = 0
                won = 0
                
            print 'step[%s] turn=%s, action=%s, visited=%s, won=%s' % \
                    (step, turn, action, visited, won)
            step += 1
        print ''
        
    def printResult(self):
        print 'total states: %s' % len(self.visited)
                    
    def save(self, step):
        if not os.path.exists('snapshot'):
            os.makedirs('snapshot')
        fileName = 'snapshot/mcts_%s' % step
        with open(fileName + '.pickle', 'wb') as f:
            pickle.dump(self, f)
        
    def gogo(self):
        self.initializeProcesses()

        lastResult = []
        lastResultWin = 0
        for i in range(self.totalGameNo):
            self.env.reset()
            state = self.env.getState()
            history = []
            turn = random.randint(self.PLAYER, self.OPP)
            startTime = time.time()
            
            while True:
                if turn == self.PLAYER:
                    self.simulate(state)
                    if self.settings['player_action'] == 'egreedy':
                        action = self.getActionEGreedy(state, self.PLAYER)
                    else:
                        action = self.getAction(state, self.PLAYER)
                elif turn == self.OPP:
                    if self.settings['opponent'] == 'user':
                        action = self.env.getManualAction(state)
                    else:
                        action = self.simpleAgent.getAction(state)
                
                state, gameOver, winner = self.doAction(state, action, turn, history, None, True, True)

                if gameOver:
                    break
                
                if turn == self.PLAYER:
                    turn = self.OPP
                else:
                    turn = self.PLAYER

            elapsed = time.time() - startTime
            
            if self.settings['opponent'] == 'user':
                self.env.showWinner(winner)
                
            self.playedGameNo += 1
            
            self.winnerResult[winner] += 1
            if winner == -1:
                print 'Game draw'
            else:
                self.updateTreeInfo(winner, history)
                if winner == self.PLAYER:
                    lastResultWin += 1
                if len(lastResult) == 100:
                    todel = lastResult.pop(0)
                    if todel == 1:
                        lastResultWin -= 1
                lastResult.append(winner)
                lastRatio = float(lastResultWin) * 100 / len(lastResult)
                #mcts.printResult()
                winRatio = float(self.winnerResult[self.PLAYER]) * 100 \
                                     / (self.winnerResult[self.OPP] + self.winnerResult[self.PLAYER])
                
                if winner == 1:
                    winStr = 'Win'
                else:
                    winStr = 'Lose'
                print 'Game %s : %s, %s, total=%.0f%%, last 100=%.0f%%, %.1fs' % (self.playedGameNo, self.winnerResult, winStr, winRatio, lastRatio, elapsed)
            
            if self.playedGameNo % self.saveStepNo == 0:
                self.save(self.playedGameNo)
            #time.sleep(5)
    
        self.debugger.finish()