import numpy as np

# NOTE (assumption): these imports follow the layout of the old `mushroom`
# RL library that this code targets; exact module paths depend on the
# installed version and may need adjusting.
from mushroom.algorithms.value import RQLearning
from mushroom.core import Core
from mushroom.environments import GridWorldGenerator, GridWorldVanHasselt
from mushroom.policy import EpsGreedy
from mushroom.utils.callbacks import CollectDataset, CollectMaxQ
from mushroom.utils.dataset import parse_dataset
from mushroom.utils.parameters import ExponentialDecayParameter
from mushroom.utils.variance_parameters import (
    VarianceIncreasingParameter, WindowedVarianceIncreasingParameter)


def experiment_others(alg, decay_exp):
    np.random.seed()

    # MDP: grid world loaded from a map file.
    grid_map = "simple_gridmap.txt"
    mdp = GridWorldGenerator(grid_map=grid_map)

    # Policy: eps-greedy with exponentially decaying exploration.
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent: per-state-action learning rate with the given decay exponent.
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)
    algorithm_params = dict(learning_rate=alpha)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(pi, mdp.info, agent_params)

    # Algorithm: track the dataset and the max Q-value of the start state.
    collect_max_Q = CollectMaxQ(agent.Q,
                                mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
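# Usage sketch (assumption: this helper is illustrative and not part of the
# original script; the name `run_baselines`, the use of joblib, and the
# default argument values are all hypothetical). It runs `experiment_others`
# several times per algorithm class in parallel and averages the per-step
# reward and max-Q traces over the independent runs.
def run_baselines(algorithms, decay_exp=.51, n_runs=10):
    from joblib import Parallel, delayed

    results = dict()
    for alg in algorithms:
        out = Parallel(n_jobs=-1)(delayed(experiment_others)(alg, decay_exp)
                                  for _ in range(n_runs))
        # Each run returns traces of equal length (one entry per step),
        # so they can be stacked and averaged directly.
        results[alg.__name__] = (np.mean([o[0] for o in out], axis=0),
                                 np.mean([o[1] for o in out], axis=0))

    return results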
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.approximator, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
# NOTE: if kept in the same module as the previous `experiment`, this
# definition shadows it; rename one of the two when combining the scripts.
def experiment(decay_exp, windowed, tol):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent: RQLearning with a variance-dependent beta parameter; the
    # windowed variant estimates the variance over a sliding window.
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)
    if windowed:
        beta = WindowedVarianceIncreasingParameter(value=1, size=mdp.info.size,
                                                   tol=tol, window=50)
    else:
        beta = VarianceIncreasingParameter(value=1, size=mdp.info.size,
                                           tol=tol)
    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q,
                                mdp.convert_to_int(mdp._start, mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
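# Driver sketch (assumption: illustrative, not from the original file; the
# hyperparameter values, output file names, and joblib usage are all
# hypothetical). It runs the RQLearning experiment defined above several
# times in parallel and saves the averaged learning curves.
if __name__ == '__main__':
    from joblib import Parallel, delayed

    n_runs = 10
    out = Parallel(n_jobs=-1)(delayed(experiment)(.51, True, .1)
                              for _ in range(n_runs))
    np.save('rq_reward.npy', np.mean([o[0] for o in out], axis=0))
    np.save('rq_max_Q.npy', np.mean([o[1] for o in out], axis=0))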