Example #1
    def __init__(self, username, train):
        self.state_size_playing = 60
        self.action_size_playing = 20
        self.state_size_announcing = 48
        self.action_size_announcing = 21
        self.player_count = 6
        self.username = username
        self.playing_reward = 0
        self.announcing_reward = 0
        self.players = None
        self.state = None
        self.last_playing_state = None
        self.last_playing_action = None
        self.last_announcing_state = None
        self.last_announcing_action = None
        self.wrong_move = False
        self.train = train
        self.agent_playing = QAgent(self.state_size_playing,
                                    self.action_size_playing)
        self.agent_announcing = QAgent(self.state_size_announcing,
                                       self.action_size_announcing)
Example #2
def create_agents(algorithm, nodes):
  """ Create agents that employ a desired reinforcement leanring algorithm.

   Args:
     algorithm (str): name of RL algorithm
     nodes (list of Node): the network nodes

   Returns:
     list of agents
  """
  if algorithm == "minimaxQ":

    opponent_idxs_1 = [2]
    opponent_idxs_2 = [1]

    agents = [MinimaxQAgent(nodes=nodes, opp_idxs=opponent_idxs_1,
                            alpha=args.learning_rate, epsilon=args.epsilon,
                            gamma=args.discount_factor),
              MinimaxQAgent(nodes=nodes, opp_idxs=opponent_idxs_2,
                            alpha=args.learning_rate, epsilon=args.epsilon,
                            gamma=args.discount_factor)]

  elif algorithm == "Qlearning":

    agents = [QAgent(nodes=nodes, alpha=args.learning_rate,
                     epsilon=args.epsilon, gamma=args.discount_factor)]

  elif algorithm == "RomQ":

    agents = [RomQAgent(nodes=nodes, alpha=args.learning_rate,
                        epsilon=args.epsilon,
                        attack_size=args.K,
                        gamma=args.discount_factor)]
  else:
    print("Error: algorithm ", algorithm, " is not implemented.")
    quit()

  return agents
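A minimal sketch of how create_agents might be driven; the argparse flags and the build_network_nodes helper are assumptions for illustration. The snippet only shows that a module-level args object with learning_rate, epsilon, discount_factor, and K must exist before the call:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--algorithm", default="Qlearning")
parser.add_argument("--learning_rate", type=float, default=0.1)
parser.add_argument("--epsilon", type=float, default=0.1)
parser.add_argument("--discount_factor", type=float, default=0.9)
parser.add_argument("--K", type=int, default=1)  # only used by the RomQ branch
args = parser.parse_args()

nodes = build_network_nodes()  # hypothetical helper returning a list of Node objects
agents = create_agents(args.algorithm, nodes)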
Example #3
def exercise_agent(gymName, episodes, render=True, convolutional=False):
    max_t = 0
    env = gym.make(gymName)
    agent = QAgent(env.action_space.n, convolutional)
    for i_episode in range(episodes):
        state = normalize(env.reset())
        agent.observe(state, 0, False)

        total_reward = 0
        for t in range(10000):
            if render:
                env.render()

            action = agent.act()
            state, reward, done, info = env.step(action)
            state = normalize(state)
            total_reward += reward
            agent.observe(state, reward, done)
            if done:
                max_t = max(max_t, t)
                print(f'{t} : {max_t} : {total_reward}')
                break
    env.close()
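A possible driver for exercise_agent, assuming the surrounding module provides the QAgent and normalize helpers it references; the environment id and episode count are only illustrative:

if __name__ == "__main__":
    # Short evaluation run without rendering; any Gym environment with a
    # discrete action space should fit the QAgent constructor used above.
    exercise_agent("CartPole-v1", episodes=10, render=False)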
Example #4
]

bad_opt_networks_ckp = "bad_config/qnet_Banana_local_test_87.ckp"
seeded_test_64_ckp = {
    "local": "top_configs/qnet_banana_local_episode_740.ckp",
    "target": "top_configs/qnet_banana_target_episode_740.ckp",
    "delayer": "top_configs/qnet_banana_delayer_episode_740.ckp"
}

sel_network = 1  # select one of the top scoring checkpoints; each network has different parameters
load_from_seeded_64: bool = False  # ignore the previous selection and select the re-trained (seeded) network for the test_64 configuration. Here the environment was solved (>13.5) after 740 episodes
load_bad_network: bool = False  # ignore the previous selection and load the worst network resulting from optimization.

env = BananaEnv()

agent = QAgent(action_space=env.get_action_space_size(),
               state_space=env.get_state_space_size())
if load_bad_network:
    agent.load_checkpoint(bad_opt_networks_ckp)
elif load_from_seeded_64:  # target and delayer weights are not needed here; they would only be needed to resume training where it left off.
    agent.load_checkpoint(local_checkpoint=seeded_test_64_ckp["local"],
                          target_checkpoint=seeded_test_64_ckp["target"],
                          delayer_checkpoint=seeded_test_64_ckp["delayer"])
else:
    agent.load_checkpoint(top_opt_networks_ckp[sel_network])

env.reset()
done = False
for i in range(5):
    print("Episode {:d}\n score: ".format(i), end=" ")
    done = False
    env.reset()
Example #5
    pars["mem_size_sel"] = random.choice(mem_size_choices)
    pars["update_every_sel"] = random.choice(update_every_choices)
    pars["learn_every_sel"] = random.choice(learn_every_choices)
    pars["learning_rate_sel"] = random.choice(learning_rate_choices)
    pars["eps_decay_sel"] = random.choice(eps_decay_choices)
    pars["double_qnet_sel"] = random.choice(double_qnet_choices)
    pars["delayer_sel"] = random.choice(delayer_choices)

    print(">>> test " + str(test_i))
    print(">>> parameters:")
    print(pars)

    agent = QAgent(state_space=env.get_state_space_size(),
                   action_space=env.get_action_space_size(),
                   layers=pars["layers_sel"],
                   mem_size=pars["mem_size_sel"],
                   use_delayer=pars["delayer_sel"],
                   learning_rate=pars["learning_rate_sel"],
                   double_qnet=pars["double_qnet_sel"])

    env.reset()

    update_every = pars["update_every_sel"]
    learn_every = pars["learn_every_sel"]
    curr_score = 0
    score_window = deque(maxlen=100)  # last 100 scores
    score_list = []
    mean_score_list = []
    running_score = 0
    eps_start = 1.0
    eps_decay = pars["eps_decay_sel"]
Example #6
import gym
from q_agent import QAgent
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# How long do we play
NUM_EPISODES = 500
# How often we print results
PRINT_EVERY_EPS = 100

environment = FrozenLakeEnv(is_slippery=False)

num_states = environment.observation_space.n
num_actions = environment.action_space.n

agent = QAgent(num_states, num_actions)

sum_reward = 0

for episode in range(NUM_EPISODES):
    done = False
    last_state = environment.reset()
    last_reward = None
    # Number of steps taken. A bit of a safeguard...
    num_steps = 0
    while not done:
        # Epsilon-greedy policy
        action = agent.get_action(last_state, environment)

        state, reward, done, info = environment.step(action)
Example #7
import gym
from q_agent import QAgent

env = gym.make('FrozenLake-v0')
print(env.action_space)
print(env.observation_space)

agent = QAgent(env.observation_space, env.action_space)
agent.learn(env)
success = 0
for i_episode in range(100):
    observation = env.reset()
    while True:
        # env.render()
        action = agent.act(observation)
        observation, reward, done, info = env.step(action)
        if done:
            # print("Episode finished after {} timesteps".format(t+1))
            if reward == 1.0:
                success += 1
            break
print("success rate is {}".format(success))
Example #8
import numpy as np
from q_agent import Agent as QAgent

agent = QAgent(env_name='FrozenLake-v0', a=0.3)
training_returns = agent.play_n_episodes(5000, is_training=True)
policy_returns = agent.play_n_episodes(1000, is_training=False)
print('Average return of final policy for FrozenLake:',
      np.mean(policy_returns))

_8x8_agent = QAgent(env_name='FrozenLake8x8-v0', a=0.1, e_step=0.000001)
_8x8_training_returns = _8x8_agent.play_n_episodes(25000, is_training=True)
_8x8_policy_returns = _8x8_agent.play_n_episodes(1000, is_training=False)
print('Average return of final policy for FrozenLake 8x8:',
      np.mean(_8x8_policy_returns))
Example #9
#set dqnet and training parameters
layers = [128, 64]  #hidden layers of the neural networks
mem_size = 5000  # capacity of the experience replay buffer, number of experiences
update_every = 2  # update target network every # episodes
eps_start = 1.0
eps_end = 0.01
eps_decay = 0.99  # for epsilon greedy policy in training
learn_every = 4  # trigger learning every # actions
learning_rate = 0.0005
use_delayer = True
double_qnet = True

agent = QAgent(state_space=env.get_state_space_size(),
               action_space=env.get_action_space_size(),
               layers=layers,
               mem_size=mem_size,
               learning_rate=learning_rate,
               use_delayer=use_delayer,
               double_qnet=double_qnet,
               seed=0)

#initialize
random.seed(0)
print(env.reset())
curr_score = 0
score_window = deque(maxlen=100)  # last 100 scores
score_list = []
running_score = 0
eps = eps_start
plt.ion()
fig = plt.figure()
ax = fig.add_subplot(111)
Example #10
from v_table import VTable
from q_agent import QAgent
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

# How long do we play
NUM_EPISODES = 100000
# How often we show current V-estimate
SHOW_EVERY_EPISODES = 10000

environment = FrozenLakeEnv(is_slippery=False)

num_states = environment.observation_space.n
num_actions = environment.action_space.n

vtable = VTable(num_states, discount_factor=0.5)
agent = QAgent(num_states, num_actions)
# Load already trained Q-table
agent.load("q_table.npy")

for episode in range(NUM_EPISODES):
    done = False
    state = environment.reset()
    # Keep track of visited states and rewards
    # obtained
    states = []
    rewards = []
    while not done:
        # Store state
        states.append(state)
        # Take action according to Q-agent
        action = agent.get_action(state, environment)
Example #11
from plane_state import PlaneState
from flight_manager import FlightManager
from time import sleep
from agent import Agent
from q_agent import QAgent
import sys

# state = planeState.PlaneState(5691.52001953125, -7.349344253540039, -51844.9609375, 10)
# print state.get_state_vector()

# while True:
#   print state.get_state_vector()
#   sleep(1)
args = sys.argv
args.pop(0)
weight = [float(arg) for arg in args]
if not weight:
    agent = QAgent(54, 7, 12)
else:
    agent = QAgent(54, 7, weight)

flightManager = FlightManager(agent, "alpha 0.0005")

print "Starting Flight"
while True:
    flightManager.run_episode()
Example #12
class WizardEnv(WizardCallback):
    '''
    state layout playing:
          int          current round [1,20]
          int          number of players
          int[6]       announcements
          int[20 * 2]  hand cards: 1 int for strength, one for color
          int[6 * 2]   table cards

          => state_size = 60

    state layout announcing:
          int          current round [1,20]
          int          number of players
          int[6]       announcements
          int[20 * 2]  hand cards: 1 int for strength, one for color

          => state_size = 48

    card layout:
        1st int: strength: 1-13 for normal card, 14-26 for trump, 27 wizard, 1 fool
        2nd int: color: 1-4 for normal colors, 0 for fool and wizard
    '''
    def __init__(self, username, train):
        self.state_size_playing = 60
        self.action_size_playing = 20
        self.state_size_announcing = 48
        self.action_size_announcing = 21
        self.player_count = 6
        self.username = username
        self.playing_reward = 0
        self.announcing_reward = 0
        self.players = None
        self.state = None
        self.last_playing_state = None
        self.last_playing_action = None
        self.last_announcing_state = None
        self.last_announcing_action = None
        self.wrong_move = False
        self.train = train
        self.agent_playing = QAgent(self.state_size_playing,
                                    self.action_size_playing)
        self.agent_announcing = QAgent(self.state_size_announcing,
                                       self.action_size_announcing)

    def play_game(self):
        self.playing_reward = 0
        self.announcing_reward = 0
        self.players = None
        self.state = None
        self.last_playing_state = None
        self.last_playing_action = None
        self.last_announcing_state = None
        self.last_announcing_action = None
        for agent in [self.agent_announcing, self.agent_playing]:
            agent.memory_buffer = list()
            agent.load_weights()
        self.game = WizardGame(self.username, self)
        self.game.start()

    def send(self, msg):
        self.game.send(msg)

    def on_turn(self, ws, state, players):
        enc_state = self.encode_state(state, players)
        if state["announcing"]:
            if self.train and self.last_announcing_state is not None:
                self.agent_announcing.store_episode(
                    self.last_announcing_state, self.last_announcing_action,
                    self.announcing_reward, enc_state, False)
            action_space = list(range(state["round"] + 1))
            force_random = False
            if self.wrong_move:
                force_random = True
            elif self.train and self.last_playing_state is not None:
                self.agent_playing.store_episode(self.last_playing_state,
                                                 self.last_playing_action,
                                                 self.playing_reward, None,
                                                 True)
                self.last_playing_state = None
            action = self.agent_announcing.compute_action(
                enc_state, action_space, force_random)
            ws.send(json.dumps({"action": "announce", "announcement": action}))

            self.announcing_reward = self.playing_reward = 0
            self.last_announcing_action = action
            self.last_announcing_state = enc_state
        else:
            if self.train and self.last_playing_state is not None:
                self.agent_playing.store_episode(self.last_playing_state,
                                                 self.last_playing_action,
                                                 self.playing_reward,
                                                 enc_state, False)
            action_space = list(range(len(state["hand"])))
            if self.wrong_move:
                action = self.agent_playing.compute_action(
                    enc_state, action_space, True)
            else:
                action = self.agent_playing.compute_action(
                    enc_state, action_space)
            card = state["hand"][action]
            ws.send(json.dumps({"action": "play_card", **card}))
            self.playing_reward = 0
            self.last_playing_action = action
            self.last_playing_state = enc_state
        self.wrong_move = False

    def on_choosing_trump(self, ws, state, players):
        if state["choosing_trump"] == self.username:
            ws.send(json.dumps({"action": "choose_trump", "color": "red"}))

    def on_state_update(self, ws, state):
        self.state = state
        if state["game_over"]:
            if self.train:
                self.agent_playing.store_episode(self.last_playing_state,
                                                 self.last_playing_action,
                                                 self.playing_reward, None,
                                                 True)
                self.agent_announcing.store_episode(
                    self.last_announcing_state, self.last_announcing_action,
                    self.announcing_reward, None, True)
                self.agent_playing.train()
                self.agent_announcing.train()
            self.agent_playing.update_exploration_probability()
            self.agent_announcing.update_exploration_probability()
            ws.close()

    def on_player_update(self, ws, players):
        if self.players:
            old_player = self.get_player(self.players)
            new_player = self.get_player(players)
            if old_player["tricks"] != new_player["tricks"]:
                self.playing_reward += -10 if new_player[
                    "tricks"] > new_player["announcement"] else 5
                if new_player["tricks"] == new_player["announcement"]:
                    self.playing_reward += 15
            if old_player["score"] != new_player["score"]:
                self.announcing_reward += new_player["score"] - old_player[
                    "score"]
        self.players = players

    def on_error(self, ws, msg):
        if msg != 'It\'s not your turn, bitch':
            if msg != 'Nope. Wrong number ¯\\_(ツ)_/¯':
                self.playing_reward -= 10
            else:
                self.announcing_reward -= 10
            self.wrong_move = True

    def encode_state(self, state, players):
        # Layout (see class docstring): [round, n_players, 6 announcements,
        # 20 * 2 hand-card slots, (playing only) 6 * 2 table-card slots].
        encoded_state = np.zeros(self.state_size_announcing
                                 if state["announcing"] else
                                 self.state_size_playing)
        encoded_state[0] = state["round"]
        encoded_state[1] = len(players)
        for (i, p) in enumerate(players):
            encoded_state[i + 2] = p["announcement"]
        for (i, c) in enumerate(state["hand"]):
            nr, color = self.encode_card(c, state["trump"])
            # Hand cards start right after the 2 header fields and 6 announcements.
            encoded_state[2 + 6 + 2 * i] = nr
            encoded_state[2 + 6 + 2 * i + 1] = color
        if not state["announcing"]:
            for (i, c) in enumerate(state["table"]):
                nr, color = self.encode_card(c, state["trump"])
                # Table cards follow the 20 * 2 hand-card slots.
                encoded_state[2 + 6 + 40 + 2 * i] = nr
                encoded_state[2 + 6 + 40 + 2 * i + 1] = color
        return encoded_state

    def get_player(self, players):
        for p in players:
            if p["name"] == self.username:
                return p
        return None

    def encode_card(self, c, trump):
        if c["type"] == "wizard":
            nr = 27
            color = 0
        elif c["type"] == "fool":
            nr = 1
            color = 0
        elif c["type"] == "number":
            nr = c["number"]
            if trump and trump["type"] == "number" and c["color"] == trump[
                    "color"]:
                nr += 13
            color = self.stc(c["color"])
        return (nr, color)

    def stc(self, color):
        colors = ['red', 'blue', 'green', 'yellow', 'orange']
        return colors.index(color) + 1
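A hedged usage sketch for the class above; the username and number of games are arbitrary, and WizardGame (the websocket client it wraps) is external to this snippet:

if __name__ == "__main__":
    env = WizardEnv(username="q-bot", train=True)
    for _ in range(100):
        # play_game() reconnects via WizardGame and presumably blocks until
        # on_state_update() sees game_over and closes the socket.
        env.play_game()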
Example #13
    env_name = "PongDuel-v0"
    num_episodes = 100000
    num_steps = 1000

    agent_args = {
        "n_agent_y": 40,
        "n_ball_y": 40,
        "n_ball_x": 30,
        "n_dir": 6,
        "n_actions": 3,
        "n_enemy_y": 40,
        "epsilon": 0.1
    }

    env = gym.make(env_name)
    agent_0 = QAgent(**agent_args)
    agent_1 = QAgent(**agent_args)

    for e in range(num_episodes):
        cumulative_reward = 0

        obs = env.reset()

        # reinforcement loop
        # while True:
        for _ in range(num_steps):
            state_0, state_1 = get_obs_tuples(obs)
            a_0_y, b_0_y, b_0_x, d_0, e_0_y = (state_0[0], state_0[1],
                                               state_0[2], state_0[3],
                                               state_0[4])
            a_1_y, b_1_y, b_1_x, d_1, e_1_y = (state_1[0], state_1[1],
                                               state_1[2], state_1[3],
                                               state_1[4])
Example #14
def main():
    View(QAgent())
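The snippet omits an entry point; the conventional guard below would let it run as a script (assuming View and QAgent are importable in the same module):

if __name__ == "__main__":
    main()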
Example #15
    env = gym.make('CartPole-v0')
    env._max_episode_steps = None

    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)

    action_dist = []
    # multivariate
    # mvn = scipy.stats.multivariate_normal(np.zeros(state_size), np.eye(state_size, state_size))
    # action_dist.append(mvn)
    # action_dist.append(mvn)
    #univariate
    action_dist.append(scipy.stats.norm(0.0, 1.0))
    action_dist.append(scipy.stats.norm(0.0, 1.0))
    agent_q = QAgent(action_dist)

    def train():
        #         file = open('reward.csv','w')
        #         file.write("Episodes"+","+"reward"+"\n")

        file = open('dqn.csv', 'w')
        file.write("Episodes" + "," + "time" + "\n")

        ##dqn agent

        # agent.load("model/cartpole-ddqn.h5")
        done = False
        batch_size = 128

        scores = []