Example #1
    def runModel(self):
        """
        Each agent samples the realityMap and generates its own agentMap;
        the agent maps are then stitched into the collectiveMap,
        which is compared against the realityMap.
        """
        for agent in self.agentList:
            agent.run()
        self.makeCollectiveMap()
        self.calculateDifference()
Example #2
    def test_run(self):
        try:
            # Verify termination condition on bounded run
            count = run(1)
            self.assertEqual(count, 1)

        except Exception as e:
            self.fail(e)
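The test above assumes a module-level run(n) that performs at most n iterations and returns how many completed. A minimal sketch of such a function, purely illustrative and not the original project's implementation, could look like this:

def run(max_steps):
    """Run the simulation for at most max_steps iterations and return how many completed."""
    completed = 0
    for _ in range(max_steps):
        # A real implementation would perform one simulation step here;
        # this sketch only counts iterations.
        completed += 1
    return completed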
Example #3
File: helper.py  Project: thomalm/udacity
def run_experiment(alpha, epsilon, gamma, selection, n_runs, trials, f=False):
    """
    Perform n_runs experiments and visualize the results
    """

    stats = []

    # Run n_runs experiments
    for i in range(n_runs):
        # Run simulation
        s, m, r = agent.run(alpha, epsilon, gamma, selection, trials, f)
        stats.append(sum(s))
        # Visualize results
        visualize_run(s, m, r, trials, n_runs, i)

    print "Average success rate {0:.3f} %".format((np.mean(stats) / trials) * 100)
Example #4
from common import *
import tqdm
import numpy as np
from agent import run

import gym
import sys

sys.path.append("..")
import gym_tictactoe  # Needed to add 'TicTacToe-v0' into gym registry

import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Number of cognitive cycles to execute (None -> forever)
N_STEPS = 1000
experimenting = True

if __name__ == '__main__':
    rewards = []
    for i in tqdm.tqdm(range(100)):
        environment = gym.make('TicTacToe-v0')
        environment.reset()
        _, reward = run(environment, n=N_STEPS, render=not experimenting)
        rewards.append(reward)

    print(np.mean(rewards))
Example #5
import agent

if __name__ == '__main__':
	# Perform an exhaustive search over all parameter combinations
	# Save parameters and scores into a CSV file for easy sorting and plotting

	# Declare parameter ranges to search over
	sigmoid_offset_range = [4., 6., 8.]
	sigmoid_rate_range   = [10**i for i in range(-3, 0)]
	alpha_decay_range    = [0.1, 0.5, 0.9]
	gamma_range          = [0.1, 0.5, 0.9]

	csv_string = 'sigmoid_offset,sigmoid_rate,alpha_decay,gamma,score\n'
	score = -1.

	# Sweep over all parameter combinations
	for sigmoid_offset in sigmoid_offset_range:
		for sigmoid_rate in sigmoid_rate_range:
			for alpha_decay in alpha_decay_range:
				for gamma in gamma_range:
					# Run trials with given parameter combination
					score = agent.run(sigmoid_offset=sigmoid_offset, sigmoid_rate=sigmoid_rate, alpha_decay=alpha_decay, gamma=gamma)

					csv_string += '%f,%f,%f,%f,%.4f\n' % (sigmoid_offset,sigmoid_rate,alpha_decay,gamma,score)
					print('Hyper-parameter search status: %f,%f,%f,%f,%.4f' % (sigmoid_offset,sigmoid_rate,alpha_decay,gamma,score))  # [debug]

	# Write results to hyper_params.csv
	fo = open('hyper_params.csv', 'w')
	fo.write(csv_string)
	fo.close()
Example #6
from agent import run

search_values = [0.01, 0.03, 0.05, 0.07, 0.1, 0.3, 0.5, 0.7]

for alpha in search_values:
    for gamma in search_values:
        for epsilon in search_values:
            run(alpha, gamma, epsilon)
Example #7
def run(env: Union[Env, str],
        trials: int = 1,
        episodes: int = 10000,
        steps: Optional[int] = None,
        discount: float = 1.0,
        starting_q: Optional[float] = None,
        exploration_rate: float = 1.0,
        exploration_rate_decay: float = 0.99,
        min_exploration_rate: float = 0.00,
        delta: float = 0.001,
        c: float = 0.0,
        lamb: float = 1.0,
        omega: float = 0.8,
        verbose: Optional[int] = None,
        save: bool = True,
        save_dir: str = 'results',
        plot: bool = True,
        smoothing: float = 0.05,
        iqr: float = 0.0):
    """
    Runs three agents (UCB-H+, UCB-H, and Q-Learning) in a given environment
    :param env: Environment: either an OpenAI Gym environment or a string;
    in the latter case gym.make(env) will be used.
    :param trials: Number of trials to run
    :param episodes: Number of episodes in each trial
    :param steps: Number of time steps per episode
    :param discount: Discounting factor
    :param starting_q: Initial Q-values for Q-Learning. UCB-H+ and UCB-H infer these from the environment
    :param exploration_rate: Initial exploration rate for Q-Learning
    :param exploration_rate_decay: Exploration rate decay for Q-Learning; each time step the exploration rate is
    multiplied by this until it reaches the minimum exploration rate
    :param min_exploration_rate: Minimum exploration rate for Q-Learning
    :param delta: PAC-probability delta for UCB-H and UCB-H+
    :param c: UCB-constant c for UCB-H and UCB-H+
    :param lamb: lambda coefficient for UCB-H+; this is added to the numerator and denominator of the learning rate
    :param omega: power coefficient for UCB-H+
    :param verbose: how much information to print to the console, from 0 (no output) to 4 (very detailed)
    :param save: whether to save the experiment data or not
    :param save_dir: directory where the data will be saved
    :param plot: whether to plot the experiment data or not
    :param smoothing: moving-average smoothing relative to the number of episodes
    :param iqr: inter-quantile range for plotting
    """
    import agent
    import environment  # this is required for custom environments to show up in the OpenAI Gym registry

    # Initialize the environment and make it a TimeLimit environment for episodic learning
    if isinstance(env, str):
        env_name = env
        env = make(env_name)
    else:
        env_name = type(env).__name__
    if not isinstance(env, TimeLimit):
        assert steps is not None, 'The number of steps per episode is not given'
        env = TimeLimit(env, steps)
    else:
        if steps is None:
            steps = env._max_episode_steps
        else:
            env._max_episode_steps = steps

    # If verbosity is not given, use 0, i.e., no console output
    if verbose is None:
        verbose = 0

    # If starting Q-value for Q-learning is not supplied, try to infer it from the environment
    if starting_q is None:
        reward_max = float(env.reward_range[1])
        starting_q = reward_max / (
            1.0 - discount) if discount < 1 else reward_max * steps

    # Initialize the agents
    agents = [
        agent.QUCBHPlusLearningAgent(env=env,
                                     name='QUCBPlus',
                                     verb=verbose,
                                     discount=discount,
                                     delta=delta,
                                     c=c,
                                     lam=lamb,
                                     omega=omega),
        agent.QUCBHLearningAgent(
            env=env,
            name='QUCB',
            verb=verbose,
            discount=discount,
            delta=delta,
            c=c,
        ),
        agent.SimpleQLearningAgent(env=env,
                                   policy=agent.policy.EpsilonGreedyPolicy(
                                       exploration_rate,
                                       exploration_rate_decay,
                                       min_exploration_rate),
                                   name='Q_max',
                                   verb=verbose,
                                   discount=discount,
                                   starting_q=starting_q)
    ]

    # Start the experiments
    if verbose >= 1:
        print(f'Starting environment {env_name}.\n')
    results = []

    # Find an exact solution by solving the underlying MDP. This solution is used in plotting
    solution = pr.solve(env, discount, steps)
    if verbose >= 1:
        print(f'Value: {solution}.\n')

    # Build a path to save directory
    path = os.path.join(
        save_dir, env_name,
        datetime.now().strftime('%Y-%m-%d-%H-%M')) if save else None

    # Run the trials
    for trial in range(trials):

        if verbose >= 1:
            print(f'Starting trial #{trial}.\n')

        # Run each agent for the given number of episodes
        for agent in agents:
            agent.reset_environment()
            agent.run(episodes)

            result = agent.get_stats()
            result = [{
                **{
                    'method': agent.name,
                    'trial': trial
                },
                **r
            } for r in result]

            if save:
                pr.save(path, env_name, result)

            results.extend(result)

        if verbose >= 1:
            print(f'Trial #{trial} done.\n')

    # Add the solution to the results file
    if save:
        solution_dict = {key: '' for key in results[0].keys()}
        solution_dict['method'] = 'Solution'
        solution_dict['trial'] = 0
        solution_dict['episode'] = 0
        solution_dict['discounted total reward'] = solution
        pr.save(path, env_name, [solution_dict])

    # Visualize the data if plotting is on
    if plot:
        results = pr.fetch_stat(results, 'total reward', episodes, trials)
        plot_quantiles = iqr is not None and 0.0 < iqr <= 1.0
        pr.plot(results,
                title='{}, d={}'.format(env_name, discount),
                solution=solution,
                episodes=episodes,
                ma=int(smoothing * episodes),
                show_q=plot_quantiles,
                iqr=iqr)
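For context, a hypothetical invocation of this function is sketched below; the environment name and every parameter value are illustrative assumptions, not taken from the original project:

if __name__ == '__main__':
    # Compare UCB-H+, UCB-H and Q-Learning on a small episodic task (environment name is illustrative)
    run('FrozenLake-v0',
        trials=3,
        episodes=5000,
        steps=100,
        discount=0.95,
        delta=0.05,
        verbose=1,
        save=False,
        plot=True)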
Example #8
# Assumes nS (state size), nA (action size), hn_actor (actor hidden-layer sizes), env and
# the `agent` module are defined earlier in the script.
hn_critic = [256, 128]

actor_dict = {"input_size": nS,
              "output_size": nA,
              "hn": hn_actor,
              "batch_norm": False}

critic_dict = {"state_size": nS,
               "action_size": nA,
               "hn": hn_critic,
               "concat_stage": 1,
               "batch_norm": False}

agent_dict = {"num_episodes": 200,
              "num_replays": 1,
              "memory_size": 2**14,
              "batchsize": 128,
              "gamma": 0.99,
              "tau": 0.001,
              "learning_rate_actor": 1E-4,
              "learning_rate_critic": 1E-4,
              "save_after": 80}


# Create agent
agent = agent.Agent(agent_dict=agent_dict, actor_dict=actor_dict, critic_dict=critic_dict)

# Train agent
agent.run(env)