def runModel(self):
    """
    Each agent samples the realityMap and generates its own agentMap;
    the agentMaps are then stitched together into the collectiveMap,
    which is compared to the realityMap.
    """
    for agent in self.agentList:
        agent.run()
    self.makeCollectiveMap()
    self.calculateDifference()
def test_run(self):
    try:
        # Verify the termination condition on a bounded run
        count = run(1)
        self.assertEqual(count, 1)
    except Exception as e:
        self.fail(e)
import numpy as np

# `agent.run` and `visualize_run` are assumed to be provided by the
# surrounding project; they are not defined in this fragment.


def run_experiment(alpha, epsilon, gamma, selection, n_runs, trials, f=False):
    """Perform n_runs experiments and visualize the results."""
    stats = []
    # Run n_runs experiments
    for i in range(n_runs):
        # Run a single simulation
        s, m, r = agent.run(alpha, epsilon, gamma, selection, trials, f)
        stats.append(sum(s))
        # Visualize results
        visualize_run(s, m, r, trials, n_runs, i)
    print("Average success rate {0:.3f} %".format((np.mean(stats) / trials) * 100))
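# A hypothetical invocation of run_experiment for illustration. The parameter
# values are arbitrary, and 'greedy' is a placeholder: the valid options for
# `selection` are not visible in this fragment.
run_experiment(alpha=0.1, epsilon=0.05, gamma=0.9,
               selection='greedy', n_runs=10, trials=100)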
from common import *

import tqdm
import numpy as np

from agent import run

import gym
import sys

sys.path.append("..")
import gym_tictactoe  # Needed to add 'TicTacToe-v0' into the gym registry

import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Number of cognitive cycles to execute (None -> forever)
N_STEPS = 1000

experimenting = True

if __name__ == '__main__':
    rewards = []
    for i in tqdm.tqdm(range(100)):
        environment = gym.make('TicTacToe-v0')
        environment.reset()
        _, reward = run(environment, n=N_STEPS, render=not experimenting)
        rewards.append(reward)
    print(np.mean(rewards))
import agent

if __name__ == '__main__':
    # Perform an exhaustive search over all parameter combinations.
    # Save parameters and scores into a CSV file for easy sorting and plotting.

    # Declare parameter ranges to search over
    sigmoid_offset_range = [4., 6., 8.]
    sigmoid_rate_range = [10**i for i in range(-3, 0)]
    alpha_decay_range = [0.1, 0.5, 0.9]
    gamma_range = [0.1, 0.5, 0.9]

    csv_string = 'sigmoid_offset,sigmoid_rate,alpha_decay,gamma,score\n'
    score = -1.

    # Sweep over all parameter combinations
    for sigmoid_offset in sigmoid_offset_range:
        for sigmoid_rate in sigmoid_rate_range:
            for alpha_decay in alpha_decay_range:
                for gamma in gamma_range:
                    # Run trials with the given parameter combination
                    score = agent.run(sigmoid_offset=sigmoid_offset,
                                      sigmoid_rate=sigmoid_rate,
                                      alpha_decay=alpha_decay,
                                      gamma=gamma)
                    csv_string += '%f,%f,%f,%f,%.4f\n' % (
                        sigmoid_offset, sigmoid_rate, alpha_decay, gamma, score)
                    print('Hyper-parameter search status: %f,%f,%f,%f,%.4f' % (
                        sigmoid_offset, sigmoid_rate, alpha_decay, gamma, score))  # [debug]

    # Write results to hyper_params.csv
    with open('hyper_params.csv', 'w') as fo:
        fo.write(csv_string)
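# A minimal sketch of the "easy sorting" the comment above mentions, using
# pandas. The pandas dependency is an assumption, not part of the original
# script; it simply reads back the CSV written above.
import pandas as pd

df = pd.read_csv('hyper_params.csv')
print(df.sort_values('score', ascending=False).head())  # best combinations first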
from agent import run

search_values = [0.01, 0.03, 0.05, 0.07, 0.1, 0.3, 0.5, 0.7]

# Grid search over all (alpha, gamma, epsilon) combinations
for alpha in search_values:
    for gamma in search_values:
        for epsilon in search_values:
            run(alpha, gamma, epsilon)
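# An equivalent, flatter formulation of the same sweep using itertools.product;
# a stylistic alternative, not part of the original script. Either way the grid
# covers 8**3 = 512 combinations.
import itertools

for alpha, gamma, epsilon in itertools.product(search_values, repeat=3):
    run(alpha, gamma, epsilon)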
import os
from datetime import datetime
from typing import Optional, Union

from gym import Env, make
from gym.wrappers import TimeLimit

# `pr` (the solve/save/fetch_stat/plot helpers used below) is assumed to be
# imported from the surrounding project; it is not defined in this fragment.


def run(env: Union[Env, str],
        trials: int = 1,
        episodes: int = 10000,
        steps: Optional[int] = None,
        discount: float = 1.0,
        starting_q: Optional[float] = None,
        exploration_rate: float = 1.0,
        exploration_rate_decay: float = 0.99,
        min_exploration_rate: float = 0.00,
        delta: float = 0.001,
        c: float = 0.0,
        lamb: float = 1.0,
        omega: float = 0.8,
        verbose: Optional[int] = None,
        save: bool = True,
        save_dir: str = 'results',
        plot: bool = True,
        smoothing: float = 0.05,
        iqr: float = 0.0):
    """
    Runs three agents (UCB-H+, UCB-H, and Q-Learning) in a given environment.

    :param env: Environment: either an OpenAI Gym environment or a string; in the
        latter case gym.make(env) will be used
    :param trials: Number of trials to run
    :param episodes: Number of episodes in each trial
    :param steps: Number of time steps per episode
    :param discount: Discounting factor
    :param starting_q: Initial Q-values for Q-Learning. UCB-H+ and UCB-H infer
        these from the environment
    :param exploration_rate: Initial exploration rate for Q-Learning
    :param exploration_rate_decay: Exploration rate decay for Q-Learning; each
        time step the exploration rate is multiplied by this until it reaches
        the minimum exploration rate
    :param min_exploration_rate: Minimum exploration rate for Q-Learning
    :param delta: PAC-probability delta for UCB-H and UCB-H+
    :param c: UCB constant c for UCB-H and UCB-H+
    :param lamb: lambda coefficient for UCB-H+; this is added to the numerator
        and denominator of the learning rate
    :param omega: power coefficient for UCB-H+
    :param verbose: how much information to output to the console, from 0 (none)
        to 4 (a lot of information)
    :param save: whether to save the experiment data or not
    :param save_dir: directory where the data will be saved
    :param plot: whether to plot the experiment data or not
    :param smoothing: moving-average smoothing relative to the number of episodes
    :param iqr: inter-quantile range for plotting
    """
    import agent
    import environment  # required for custom environments to show up in the OpenAI Gym registry

    # Initialize the environment and make it a TimeLimit environment for episodic learning
    if isinstance(env, str):
        env_name = env
        env = make(env_name)
    else:
        env_name = type(env).__name__
    if not isinstance(env, TimeLimit):
        assert steps is not None, 'The number of steps per episode is not given'
        env = TimeLimit(env, steps)
    else:
        if steps is None:
            steps = env._max_episode_steps
        else:
            env._max_episode_steps = steps

    # If verbosity is not given, use 0, i.e., no console output
    if verbose is None:
        verbose = 0

    # If the starting Q-value for Q-Learning is not supplied, try to infer it from the environment
    if starting_q is None:
        reward_max = float(env.reward_range[1])
        starting_q = reward_max / (1.0 - discount) if discount < 1 else reward_max * steps

    # Initialize the agents
    agents = [
        agent.QUCBHPlusLearningAgent(env=env,
                                     name='QUCBPlus',
                                     verb=verbose,
                                     discount=discount,
                                     delta=delta,
                                     c=c,
                                     lam=lamb,
                                     omega=omega),
        agent.QUCBHLearningAgent(env=env,
                                 name='QUCB',
                                 verb=verbose,
                                 discount=discount,
                                 delta=delta,
                                 c=c),
        agent.SimpleQLearningAgent(env=env,
                                   policy=agent.policy.EpsilonGreedyPolicy(
                                       exploration_rate,
                                       exploration_rate_decay,
                                       min_exploration_rate),
                                   name='Q_max',
                                   verb=verbose,
                                   discount=discount,
                                   starting_q=starting_q)
    ]

    # Start the experiments
    if verbose >= 1:
        print(f'Starting environment {env_name}.\n')
    results = []

    # Find an exact solution by solving the underlying MDP.
    # This solution is used in plotting.
    solution = pr.solve(env, discount, steps)
    if verbose >= 1:
        print(f'Value: {solution}.\n')

    # Build a path to the save directory
    path = os.path.join(save_dir, env_name,
                        datetime.now().strftime('%Y-%m-%d-%H-%M')) if save else None

    # Run the trials
    for trial in range(trials):
        if verbose >= 1:
            print(f'Starting trial #{trial}.\n')
        # Run each agent for the given number of episodes (the loop variable is
        # named `learner` so it does not shadow the `agent` module)
        for learner in agents:
            learner.reset_environment()
            learner.run(episodes)
            result = learner.get_stats()
            result = [{**{'method': learner.name, 'trial': trial}, **r} for r in result]
            if save:
                pr.save(path, env_name, result)
            results.extend(result)
        if verbose >= 1:
            print(f'Trial #{trial} done.\n')

    # Add the solution to the results file
    if save:
        solution_dict = {key: '' for key in results[0].keys()}
        solution_dict['method'] = 'Solution'
        solution_dict['trial'] = 0
        solution_dict['episode'] = 0
        solution_dict['discounted total reward'] = solution
        pr.save(path, env_name, [solution_dict])

    # Visualize the data if plotting is on
    if plot:
        results = pr.fetch_stat(results, 'total reward', episodes, trials)
        plot_quantiles = iqr is not None and 0.0 < iqr <= 1.0
        pr.plot(results,
                title='{}, d={}'.format(env_name, discount),
                solution=solution,
                episodes=episodes,
                ma=int(smoothing * episodes),
                show_q=plot_quantiles,
                iqr=iqr)
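# A minimal usage sketch for `run` above. 'FrozenLake-v0' is an illustrative
# environment name, not one taken from this project, and the parameter values
# are arbitrary examples; `starting_q` is passed explicitly because the
# inferred value depends on the environment's reward_range.
if __name__ == '__main__':
    run('FrozenLake-v0',   # resolved via gym.make
        trials=3,          # independent trials
        episodes=2000,     # episodes per trial
        steps=100,         # time-step limit per episode
        discount=0.95,
        starting_q=1.0,    # explicit initial Q-value for Q-Learning
        verbose=1)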
# `nS`/`nA` (state/action sizes), `hn_actor` (actor hidden-layer sizes), `env`,
# and the `agent` module are assumed to be defined earlier in the original
# script; this fragment starts mid-file.
hn_critic = [256, 128]

actor_dict = {"input_size": nS,
              "output_size": nA,
              "hn": hn_actor,
              "batch_norm": False}
critic_dict = {"state_size": nS,
               "action_size": nA,
               "hn": hn_critic,
               "concat_stage": 1,
               "batch_norm": False}
agent_dict = {"num_episodes": 200,
              "num_replays": 1,
              "memory_size": 2**14,
              "batchsize": 128,
              "gamma": 0.99,
              "tau": 0.001,
              "learning_rate_actor": 1E-4,
              "learning_rate_critic": 1E-4,
              "save_after": 80}

# Create agent (note: this rebinds the name `agent` from the module to the instance)
agent = agent.Agent(agent_dict=agent_dict,
                    actor_dict=actor_dict,
                    critic_dict=critic_dict)

# Train agent
agent.run(env)
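# A hedged sketch of how the missing setup above might look, assuming a Gym
# environment with continuous observation and action spaces (an assumption
# consistent with the actor-critic config, but the original setup is not shown;
# 'Pendulum-v0' and the hn_actor sizes are illustrative choices).
import gym

env = gym.make('Pendulum-v0')
nS = env.observation_space.shape[0]  # state dimensionality
nA = env.action_space.shape[0]       # action dimensionality (continuous)
hn_actor = [256, 128]                # hypothetical actor hidden-layer sizes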