Example #1
import os

import gym
import torch

# DQN, Agent, ReplayMemory, EpsilonGreedy and the hyper-parameters used below
# come from the rest of the source file (not shown here).
def initialize(args):
    global savedir

    savedir = '../instances'
    if not os.path.exists(savedir):
        os.makedirs(savedir)

    savedir = '../instances/{}'.format(args.save_instance)
    if not os.path.exists(savedir):
        os.makedirs(savedir)
        os.makedirs(savedir + '/agent_model')

    # Define PyTorch device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Variable env contains the environment class (python game)
    env = gym.envs.make("CartPole-v1")

    # Define policy and target networks
    policy_net = DQN(batch_size, learning_rate, 4, 2).to(device).float()
    target_net = DQN(batch_size, learning_rate, 4, 2).to(device).float()
    # Copy the weights
    target_net.load_state_dict(policy_net.state_dict())
    # Put the target network in evaluation mode; it is never trained directly,
    # only refreshed by copying weights from the policy network
    target_net.eval()

    memory = ReplayMemory(memory_size)
    strategy = EpsilonGreedy(eps_start, eps_end, eps_decay)
    agent = Agent(policy_net, target_net, memory, strategy, gamma, 2, device)

    return env, agent
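
Example #1 copies the policy weights into the target network once, at construction time. During DQN training this copy is normally repeated at a fixed interval so the Q-targets stay stable. The following is a minimal, self-contained sketch of that pattern; the nn.Linear stand-ins, the step count and the SYNC_EVERY interval are illustrative, not taken from the snippet.

import torch.nn as nn

# Stand-ins for the DQN networks built in initialize()
policy_net = nn.Linear(4, 2)
target_net = nn.Linear(4, 2)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()  # evaluation mode; the target net is never trained directly

SYNC_EVERY = 1000  # illustrative interval between target-network updates
for step in range(10_000):
    # ... act in the environment and optimize policy_net here ...
    if step % SYNC_EVERY == 0:
        target_net.load_state_dict(policy_net.state_dict())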
Example #2
 def __init__(self,
              starting_state,
              action_space,
              alpha=0.5,
              gamma=0.95,
              exploration_strategy=EpsilonGreedy()):
     self.state = starting_state
     self.action_space = action_space
     self.action = None
     self.alpha = alpha
     self.gamma = gamma
     self.q_table = {self.state: [0 for _ in range(action_space.n)]}
     self.q1_table = {self.state: [0 for _ in range(action_space.n)]}
     self.q2_table = {self.state: [0 for _ in range(action_space.n)]}
     self.exploration = exploration_strategy
Example #3
 def __init__(self,
              starting_state,
              state_space,
              action_space,
              alpha=0.5,
              gamma=0.95,
              exploration_strategy=EpsilonGreedy()):
     super(QLAgent, self).__init__(state_space, action_space)
     self.state = starting_state
     self.action_space = action_space
     self.action = None
     self.alpha = alpha
     self.gamma = gamma
     print('self.state:', self.state)
     self.q_table = {self.state: [0 for _ in range(action_space)]}
     self.exploration = exploration_strategy
     self.acc_reward = 0
Example #4
def epsilon_greedy_algo():
    # epsilon = 0.0: pure exploitation (profit-maximization): only the options that
    #                currently look best are played, so you never explore
    # epsilon = 1.0: pure exploration (an A/B test): wastes resources acquiring
    #                data about bad options

    epsilon = 0.1
    n_sim = 5000
    horizon = 250
    filename = 'EG'
    mean_probs = [0.1, 0.1, 0.1, 0.1, 0.9]
    algo = EpsilonGreedy(epsilon, [], [])

    test_algo_monte_carlo(algo,
                          mean_probs,
                          n_sim=n_sim,
                          horizon=horizon,
                          filename=filename,
                          store_it=True)
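
Examples #4, #5 and #6 all construct the bandit as EpsilonGreedy(epsilon, [], []) and then drive it through initialize, select_arm and update, but the class itself is not shown. Below is a minimal sketch of that interface, assuming the usual counts/values bookkeeping implied by those calls; it is not necessarily the implementation imported from epsilon_greedy.

import random

class EpsilonGreedy:
    def __init__(self, epsilon, counts, values):
        self.epsilon = epsilon  # probability of exploring a random arm
        self.counts = counts    # number of pulls per arm
        self.values = values    # running mean reward per arm

    def initialize(self, n_arms):
        self.counts = [0 for _ in range(n_arms)]
        self.values = [0.0 for _ in range(n_arms)]

    def select_arm(self):
        # Exploit the best-looking arm with probability 1 - epsilon,
        # otherwise explore a uniformly random arm.
        if random.random() > self.epsilon:
            return self.values.index(max(self.values))
        return random.randrange(len(self.values))

    def update(self, chosen_arm, reward):
        # Incremental update of the chosen arm's running mean reward.
        self.counts[chosen_arm] += 1
        n = self.counts[chosen_arm]
        self.values[chosen_arm] += (reward - self.values[chosen_arm]) / n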
Example #5
from bernoulli import BernoulliArm
from epsilon_greedy import EpsilonGreedy
from test_framework import *
import random

random.seed(1)
means = [0.1, 0.1, 0.1, 0.1, 0.9]
n_arms = len(means)
random.shuffle(means)
arms = list(map(lambda mu: BernoulliArm(mu), means))

print("arms: " + str(means))

f = open("results/greedy_results.tsv", "w")

for epsilon in [0.1, 0.2, 0.3, 0.4, 0.5]:
    algo = EpsilonGreedy(epsilon, [], [])
    algo.initialize(n_arms)
    results = test_algorithm(algo, arms, 5000, 250)
    for i in range(len(results[0])):
        f.write(str(epsilon) + "\t")
        f.write("\t".join([str(results[j][i]) for j in range(len(results))]) + "\n")

f.close()

Example #6
        algo.update(chosen_arm, reward)
    return [sim_nums, times, chosen_arms, rewards, cumulative_rewards]


if __name__ == '__main__':
    random.seed(1)
    means = [0.1, 0.1, 0.1, 0.1, 0.9]
    n_arms = len(means)
    random.shuffle(means)
    arms = [BernoulliArm(mu) for mu in means]
    print("Best Arm is")

    #lets choose an epsilon to test the epsilon_greedy algo:
    eps = 0.1
    algo_ = EpsilonGreedy(eps, [], [])
    algo_.initialize(n_arms)
    num_sims = 5000
    horizon = 250
    chosen_arms = [0.0 for i in range(num_sims * horizon)]
    print(len(chosen_arms))
    rewards = [0.0 for i in range(num_sims * horizon)]
    print(len(rewards))
    cumulative_rewards = [0.0 for i in range(num_sims * horizon)]
    print(len(cumulative_rewards))
    sim_nums = [0.0 for i in range(num_sims * horizon)]
    times = [0.0 for i in range(num_sims * horizon)]

    for sim in range(num_sims):
        sim = sim + 1
        algo_.initialize(len(arms))
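
The two indented lines at the top of Example #6 are the tail of the test_algorithm helper that Example #5 imports from test_framework. A rough sketch of the loop they belong to, assuming each arm exposes a draw() method (as BernoulliArm does) and the algorithm exposes select_arm()/update(); the actual implementation in test_framework may differ in detail.

def test_algorithm(algo, arms, num_sims, horizon):
    # Flat per-(simulation, step) logs, matching the lists built in Example #6.
    chosen_arms = [0.0] * (num_sims * horizon)
    rewards = [0.0] * (num_sims * horizon)
    cumulative_rewards = [0.0] * (num_sims * horizon)
    sim_nums = [0.0] * (num_sims * horizon)
    times = [0.0] * (num_sims * horizon)

    for sim in range(1, num_sims + 1):
        algo.initialize(len(arms))
        for t in range(1, horizon + 1):
            index = (sim - 1) * horizon + (t - 1)
            sim_nums[index] = sim
            times[index] = t
            chosen_arm = algo.select_arm()
            chosen_arms[index] = chosen_arm
            reward = arms[chosen_arm].draw()
            rewards[index] = reward
            if t == 1:
                cumulative_rewards[index] = reward
            else:
                cumulative_rewards[index] = cumulative_rewards[index - 1] + reward
            algo.update(chosen_arm, reward)
    return [sim_nums, times, chosen_arms, rewards, cumulative_rewards]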
Example #7
    def __init__(self, agent_name, action_names, training, epsilon_testing,
        state_shape, checkpoint_dir, render=False, use_logging=True):
        """
        Create an agent object instance. Initialises the replay memory
        and the neural network.

        Args:
            agent_name (str): Name of the agent
            action_names (list of str): String names of the available actions
            training (bool): Whether the agent is training the neural network (True)
                            or playing in test mode (False)
            epsilon_testing (float): Epsilon value to use when not training
            state_shape: Shape of the state that will be input to the network
            checkpoint_dir (str): Directory for checkpoints and the action config
            render (bool): Whether to render the game (redundant)
            use_logging (bool): Whether to log to text files during training

        """
        self.agent_name = agent_name
        self.checkpoint_dir = checkpoint_dir

        # The number of possible actions that the agent may take in every step.
        self.num_actions = len(action_names)

        # Whether we are training (True) or testing (False).
        self.training = training

        # Whether to render each image-frame of the game-environment to screen.
        self.render = render

        # Whether to use logging during training.
        self.use_logging = use_logging

        # Set shape of state that will be input
        self.state_shape = state_shape

        if self.use_logging and self.training:
            # Used for logging Q-values and rewards during training.
            self.log_q_values = LogQValues()
            self.log_reward = LogReward()
        else:
            self.log_q_values = None
            self.log_reward = None


        # List of string-names for the actions in the game-environment.
        self.action_names = action_names

        # Initialise epsilon greedy
        self.epsilon_greedy = EpsilonGreedy(start_value=1.0,
                                            end_value=epsilon_testing,
                                            num_iterations=5e6,
                                            num_actions=self.num_actions,
                                            epsilon_testing=epsilon_testing)

        if self.training:
            # The following control-signals are only used during training.

            # The learning-rate for the optimizer follows a linear control signal
            # (with equal start and end values it stays constant here).
            self.learning_rate_control = LinearControlSignal(start_value=0.00001,
                                                             end_value=0.00001,
                                                             num_iterations=1e5)

            # The loss-limit is used to abort the optimization whenever the
            # mean batch-loss falls below this limit.
            self.loss_limit_control = LinearControlSignal(start_value=0.0,
                                                          end_value=0.0,
                                                          num_iterations=50000)

            # The maximum number of epochs to perform during optimization.
            self.max_epochs_control = LinearControlSignal(start_value=5.0,
                                                          end_value=1.0,
                                                          num_iterations=1e5)

            # The fraction of the replay-memory to be used.
            # Early in the training, we want to optimize more frequently
            # so the Neural Network is trained faster and the Q-values
            # are learned and updated more often. Later in the training,
            # we need more samples in the replay-memory to have sufficient
            # diversity, otherwise the Neural Network will over-fit.
            self.replay_fraction = LinearControlSignal(start_value=0.1,
                                                       end_value=1.0,
                                                       num_iterations=5e6)

        else:
            # We set these objects to None when they will not be used.
            self.learning_rate_control = None
            self.loss_limit_control = None
            self.max_epochs_control = None
            self.replay_fraction = None

        if self.training:
            # We only create the replay-memory when we are training the agent,
            # because it requires a lot of RAM.
            self.replay_memory = ReplayMemory(size=16000, state_shape=self.state_shape,
                                              num_actions=self.num_actions, checkpoint_dir=checkpoint_dir)
        else:
            self.replay_memory = None

        # Create the Neural Network used for estimating Q-values.
        self.model = NeuralNetwork(model_name=agent_name, input_shape=self.state_shape, num_actions=self.num_actions, 
            checkpoint_dir=checkpoint_dir, replay_memory=self.replay_memory, training=self.training)

        # Record episode states. In the case of poker,
        # a hand constitutes an episode.
        self.episode_states = []
        self.episode_q_values = []
        self.episode_actions = []
        self.episode_epsilons = []
        self.hand_rewards = []

        # Log of the rewards obtained in each episode during calls to run()
        self.episode_rewards = []

        self.min_max_scaling = lambda a, b, min_x, max_x, x: a + ((x - min_x) * (b - a)) / (max_x - min_x)

        self.write_state_action = False
        self.output_path = "./output/player_actions/player_" + str(self.agent_name) + "_actions.csv"
        self.action_space = ['CALL', 'ALL_IN', 'CHECK', 'FOLD']

        with open(checkpoint_dir + "action_config.yaml", 'r') as yaml_file:
            self.action_config = yaml.load(yaml_file, Loader=yaml.FullLoader)

        raise_action_space = self.action_config['raise_actions']

        self.action_space.extend(raise_action_space)
        self.raise_idxs = list(range(4, len(raise_action_space) + 4))
        self.raise_multiples = self.action_config['raise_multiples']

        self.set_fold_q = self.action_config['set_fold_q']
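
Example #7 anneals several training hyper-parameters through LinearControlSignal objects, but the class is not shown. The sketch below illustrates the linear interpolation its constructor arguments imply; the get_value accessor name is an assumption, not taken from the snippet.

class LinearControlSignal:
    """Interpolate linearly from start_value to end_value over num_iterations,
    then hold end_value. Sketch only; the accessor name is assumed."""

    def __init__(self, start_value, end_value, num_iterations):
        self.start_value = start_value
        self.end_value = end_value
        self.num_iterations = num_iterations

    def get_value(self, iteration):
        if iteration >= self.num_iterations:
            return self.end_value
        fraction = iteration / self.num_iterations
        return self.start_value + fraction * (self.end_value - self.start_value)


# For instance, the max_epochs_control signal above would yield 5.0 at
# iteration 0, 3.0 halfway through, and 1.0 from iteration 1e5 onwards.
max_epochs_control = LinearControlSignal(start_value=5.0, end_value=1.0, num_iterations=1e5)
print(max_epochs_control.get_value(iteration=5e4))  # -> 3.0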
Example #8
    rl_env = SumoEnvironment(rl_params,
                             out_csv_name=out_csv,
                             phases=signal_phase)

    # initialize the states
    initial_states = rl_env.reset()

    # initialize the agent
    rl_agent = rl_params.get('DEFAULT', 'rl_agent')

    agent = Agent(starting_state=rl_env.encode_states(initial_states),
                             action_space=rl_env.action_space,
                             alpha=float(rl_params.get('DEFAULT', 'alpha')),
                             gamma=float(rl_params.get('DEFAULT', 'gamma')),
                             exploration_strategy=EpsilonGreedy(initial_epsilon=float(rl_params.get('DEFAULT', 'epsilon')),
                                                                min_epsilon=float(rl_params.get('DEFAULT', 'minimum_epsilon')),
                                                                decay=float(rl_params.get('DEFAULT', 'decay')))
                    )
    step = 0  # initialize simulation step
    while step < simulation_step:
        # take a step
        action = agent.act(step)
        step += 1
        # compute next_state and reward
        next_state, reward = rl_env.step(actions=action)
        if rl_agent == 'ql':
            # Apply Q-learning
            agent.learn_q(new_state=rl_env.encode_states(next_state), reward=reward)
        elif rl_agent == 'sarsa':
            # Apply SARSA learning
            agent.learn_sarsa(new_state=rl_env.encode_states(next_state), reward=reward)
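
Example #8 only dispatches to learn_q or learn_sarsa; what those methods compute is not shown here. For reference, the standard tabular updates the two names point to look like this, written against a plain dict-of-lists Q-table such as the one in Example #3 (a sketch, not the project's actual Agent code):

def q_learning_update(q_table, state, action, reward, new_state, alpha=0.5, gamma=0.95):
    # Off-policy: bootstrap from the greedy (max) action in the new state.
    target = reward + gamma * max(q_table[new_state])
    q_table[state][action] += alpha * (target - q_table[state][action])


def sarsa_update(q_table, state, action, reward, new_state, new_action, alpha=0.5, gamma=0.95):
    # On-policy: bootstrap from the action actually taken (e.g. chosen
    # epsilon-greedily) in the new state.
    target = reward + gamma * q_table[new_state][new_action]
    q_table[state][action] += alpha * (target - q_table[state][action])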