def experiment2():
    np.random.seed(3)
    print('mushroom :')

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    dataset = collect_dataset.get()

    return agent.Q.table
def experiment(policy, value):
    """SARSA on the taxi gridworld; returns the average reward per step
    collected during learning."""
    np.random.seed(45)

    # MDP
    mdp = generate_taxi('tests/taxi/grid.txt', rew=(0, 1, 5))

    # Policy
    pi = policy(Parameter(value=value))

    # Agent
    learning_rate = Parameter(value=.15)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = SARSA(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    n_steps = 2000
    core.learn(n_steps=n_steps, n_steps_per_fit=1, quiet=True)

    return np.sum(np.array(collect_dataset.get())[:, 2]) / float(n_steps)
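# Illustrative driver for the experiment above, not part of the original
# script: it compares two exploration rates for an eps-greedy policy. It
# assumes EpsGreedy is imported as in the surrounding code and that
# 'tests/taxi/grid.txt' is available; the values are arbitrary examples.
for eps in (.1, .2):
    avg_reward = experiment(EpsGreedy, eps)
    print('EpsGreedy(%.2f): %f' % (eps, avg_reward))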
def experiment(algorithm_class, decay_exp):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(pi, mdp.info, **algorithm_params)

    # Algorithm
    start = mdp.convert_to_int(mdp._start, mdp._width)
    collect_max_Q = CollectMaxQ(agent.approximator, start)
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset, collect_max_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
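# Hypothetical driver for the experiment above (not in the original file):
# it averages rewards and maximum Q-values over a few independent runs with
# joblib. The algorithm classes, decay exponent and run count are example
# choices.
from joblib import Parallel, delayed

n_experiment = 4
for algorithm_class in (QLearning, DoubleQLearning):
    out = Parallel(n_jobs=-1)(delayed(experiment)(algorithm_class, .51)
                              for _ in range(n_experiment))
    r = np.mean([o[0] for o in out], axis=0)
    max_Qs = np.mean([o[1] for o in out], axis=0)
    print(algorithm_class.__name__, np.sum(r), max_Qs[-1])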
def experiment_others(alg, decay_exp):
    np.random.seed()

    # MDP
    grid_map = "simple_gridmap.txt"
    mdp = GridWorldGenerator(grid_map=grid_map)

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)
    algorithm_params = dict(learning_rate=alpha)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = alg(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q, mdp.convert_to_int(mdp._start,
                                                            mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
def experiment(n_epochs, n_episodes):
    """COPDAC-Q with tile-coding features on the inverted pendulum: trains
    for n_epochs of n_episodes, then renders a low-noise evaluation run."""
    np.random.seed()

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(policy, mu, mdp.info,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        print('Mean Reward at iteration ' + str(i) + ': ' +
              str(np.sum(J) / n_steps / n_episodes))

    print('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
def experiment(decay_exp, windowed, tol):
    np.random.seed()

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    alpha = ExponentialDecayParameter(value=1, decay_exp=decay_exp,
                                      size=mdp.info.size)
    if windowed:
        beta = WindowedVarianceIncreasingParameter(value=1,
                                                   size=mdp.info.size,
                                                   tol=tol, window=50)
    else:
        beta = VarianceIncreasingParameter(value=1, size=mdp.info.size,
                                           tol=tol)
    algorithm_params = dict(learning_rate=alpha, beta=beta, off_policy=True)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = RQLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_max_Q = CollectMaxQ(agent.Q, mdp.convert_to_int(mdp._start,
                                                            mdp._width))
    collect_dataset = CollectDataset()
    callbacks = [collect_max_Q, collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1, quiet=True)

    _, _, reward, _, _, _ = parse_dataset(collect_dataset.get())
    max_Qs = collect_max_Q.get_values()

    return reward, max_Qs
def experiment(mdp, agent_high, agent_low, n_epochs, n_episodes, ep_per_eval,
               ep_per_fit_low, display, print_j, quiet):
    np.random.seed()

    dataset_callback = CollectDataset()

    computational_graph = build_computational_graph(
        mdp, agent_low, agent_high, ep_per_fit_low, [dataset_callback])

    core = HierarchicalCore(computational_graph)
    J_list = list()
    L_list = list()

    dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
    J = compute_J(dataset, gamma=mdp.info.gamma)
    J_list.append(np.mean(J))
    J_low_list = list()
    L = episodes_length(dataset)
    L_list.append(np.mean(L))
    if print_j:
        print('Reward at start :', J_list[-1])

    for n in range(n_epochs):
        core.learn(n_episodes=n_episodes, skip=True, quiet=quiet)

        ll_dataset = dataset_callback.get()
        dataset_callback.clean()
        J_low = compute_J(ll_dataset, mdp.info.gamma)
        J_low_list.append(np.mean(J_low))
        if print_j:
            print('Low level reward at epoch', n, ':', np.mean(J_low))

        dataset = core.evaluate(n_episodes=ep_per_eval, quiet=quiet)
        J = compute_J(dataset, gamma=mdp.info.gamma)
        J_list.append(np.mean(J))
        L = episodes_length(dataset)
        L_list.append(np.mean(L))

        if print_j:
            print('Reward at epoch ', n, ':', J_list[-1])

        if display:
            core.evaluate(n_episodes=1, render=True)

    return J_list, L_list, J_low_list
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    """Episodic black-box policy search on the Segway MDP with a linear
    deterministic policy and a Gaussian diagonal search distribution."""
    np.random.seed()

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(dist, policy, mdp.info, **params)

    # Train
    print(alg.__name__)
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        print('mu: ', p[:n_weights])
        print('sigma: ', p[n_weights:])
        print('Reward at iteration ' + str(i) + ': ' + str(np.mean(J)))

    print('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
def experiment(alpha):
    """True Online SARSA(lambda) with tile coding on Gym's MountainCar-v0;
    returns the mean return over the training episodes."""
    np.random.seed()

    # MDP
    mdp = Gym(name='MountainCar-v0', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(alpha)
    tilings = Tiles.generate(10, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda': .9}
    fit_params = dict()
    agent_params = {'approximator_params': approximator_params,
                    'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = TrueOnlineSARSALambda(pi, mdp.info, agent_params, features)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks=callbacks)

    # Train
    core.learn(n_episodes=20, n_steps_per_fit=1, render=False)

    dataset = collect_dataset.get()

    return np.mean(compute_J(dataset, 1.))
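# Hypothetical sweep over the learning rate for the MountainCar experiment
# above (not part of the original source); the alpha grid and the number of
# runs per value are illustrative.
from joblib import Parallel, delayed

alphas = [.1, .2, .3]
n_runs = 2
Js = Parallel(n_jobs=-1)(delayed(experiment)(alpha)
                         for alpha in alphas for _ in range(n_runs))
Js = np.array(Js).reshape(len(alphas), n_runs)
print('Mean J per alpha:', Js.mean(axis=1))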
def experiment2():
    np.random.seed(3)
    print('mushroom :')

    # MDP
    mdp = GridWorldVanHasselt()

    # Policy
    epsilon = ExponentialDecayParameter(value=1, decay_exp=.5,
                                        size=mdp.info.observation_space.size)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=1.,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    agent = QLearning(pi, mdp.info, agent_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=2000, n_steps_per_fit=1, quiet=True)

    dataset = collect_dataset.get()
    VisualizeControlBlock(dataset)

    return agent.Q.table
def experiment(policy, name, alg_version):
    np.random.seed()

    # MDP
    if name == "Taxi":
        mdp = generate_taxi('../grid.txt')
        max_steps = 100000
        evaluation_frequency = 2000
        test_samples = 10000
    elif name == "NChain-v0":
        mdp = generate_chain(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "Loop":
        mdp = generate_loop(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    elif name == "SixArms":
        mdp = generate_arms(horizon=1000)
        max_steps = 25000
        evaluation_frequency = 500
        test_samples = 10000
    elif name == "RiverSwim":
        mdp = generate_river(horizon=1000)
        max_steps = 5000
        evaluation_frequency = 100
        test_samples = 10000
    else:
        raise NotImplementedError

    # Policy
    # epsilon = ExponentialDecayParameter(value=1., decay_exp=.5,
    #                                     size=mdp.info.observation_space.size)
    epsilon_train = ExponentialDecayParameter(
        value=1., decay_exp=.5, size=mdp.info.observation_space.size)
    epsilon_test = Parameter(0)
    pi = policy(epsilon=epsilon_train)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=.2,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(pi, mdp.info, **algorithm_params)

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    core = Core(agent, mdp, callbacks)

    scores = list()
    scores_train = list()

    # Train
    for n_epoch in range(1, max_steps // evaluation_frequency + 1):
        print('- Learning:')
        # learning step
        pi.set_epsilon(epsilon_train)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1,
                   quiet=False)
        dataset = collect_dataset.get()
        if name == "Taxi":
            scores_train.append(get_stats(dataset))
        elif name in ["SixArms"]:
            scores_train.append(compute_scores_Loop(dataset, horizon=500))
        else:
            scores_train.append(compute_scores_Loop(dataset))
        collect_dataset.clean()
        mdp.reset()

        print('- Evaluation:')
        # evaluation step
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=test_samples, quiet=False)
        mdp.reset()
        scores.append(get_stats(dataset))
        # np.save(env + '/' + alg_version + '_scores.npy', scores)

    return scores_train, scores
tilings = Tiles.generate(n_tilings, [10, 10],
                         mdp.info.observation_space.low,
                         mdp.info.observation_space.high)
features = Features(tilings=tilings)

approximator_params = dict(input_shape=(features.size,),
                           output_shape=(mdp.info.action_space.n,),
                           n_actions=mdp.info.action_space.n)

# Agent
learning_rate = Parameter(.1 / n_tilings)
algorithm_params = {'learning_rate': learning_rate,
                    'lambda': .9}
fit_params = dict()
agent_params = {'approximator_params': approximator_params,
                'algorithm_params': algorithm_params,
                'fit_params': fit_params}
agent = SARSALambdaContinuous(LinearApproximator, pi, mdp.info, agent_params,
                              features)

# Algorithm
collect_dataset = CollectDataset()
callbacks = [collect_dataset]
core = Core(agent, mdp, callbacks=callbacks)

# Train
core.learn(n_episodes=100, n_steps_per_fit=1)

# Evaluate
core.evaluate(n_episodes=1, render=True)
def experiment():
    np.random.seed()

    # Model Block
    mdp = ShipSteeringMultiGate()

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)', phi=phi)

    # Function Block 2
    function_block2 = squarednormBlock(name='f2 (squared norm)')

    # Function Block 3
    function_block3 = addBlock(name='f3 (summation)')

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([38, 38])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size,),
                              output_shape=(2,))
    approximator1.set_weights(np.array([75, 75]))
    pi1 = DiagonalGaussianPolicy(mu=approximator1, sigma=sigma1)

    # Policy 2
    sigma2 = Parameter(value=.01)
    approximator2 = Regressor(LinearApproximator,
                              input_shape=(1,),
                              output_shape=mdp.info.action_space.shape)
    pi2 = GaussianPolicy(mu=approximator2, sigma=sigma2)

    # Agent 1
    learning_rate = AdaptiveParameter(value=10)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, 150, (2,)),
                              gamma=mdp.info.gamma, horizon=50)
    agent1 = GPOMDP(policy=pi1, mdp_info=mdp_info_agent1,
                    params=agent_params, features=features)

    # Agent 2
    learning_rate = AdaptiveParameter(value=.001)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {'algorithm_params': algorithm_params,
                    'fit_params': fit_params}
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(-np.pi, np.pi,
                                                           (1,)),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma, horizon=100)
    agent2 = GPOMDP(policy=pi2, mdp_info=mdp_info_agent2,
                    params=agent_params, features=None)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1', agent=agent1,
                                  n_eps_per_fit=5,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    dataset_callback = CollectDataset()
    parameter_callback2 = CollectPolicyParameter(pi2)
    control_block2 = ControlBlock(name='Control Block 2', agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[dataset_callback,
                                             parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [state_ph, reward_ph, control_block1, control_block2,
              function_block1, function_block2, function_block3, reward_acc]
    # order = [0, 1, 7, 2, 4, 5, 6, 3]
    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    function_block3.add_input(function_block2)
    function_block3.add_input(reward_ph)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block3)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    # dataset_learn_visual = core.learn(n_episodes=2000)
    dataset_learn_visual = list()
    for n in range(4):
        dataset_learn = core.learn(n_episodes=500)
        last_ep_dataset = pick_last_ep(dataset_learn)
        dataset_learn_visual += last_ep_dataset
        del dataset_learn

    # Evaluate
    dataset_eval = core.evaluate(n_episodes=10)

    # Visualize
    low_level_dataset = dataset_callback.get()
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()
    visualize_policy_params(parameter_dataset1, parameter_dataset2)
    visualize_control_block(low_level_dataset, ep_count=20)
    visualize_ship_steering(dataset_learn_visual, name='learn', n_gates=4)
    visualize_ship_steering(dataset_eval, 'evaluate', n_gates=4)
    plt.show()

    return
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, double, file_name,
               out_dir, collect_qs, seed):
    """Compare tabular Q-learning variants (ql, boot-ql, particle-ql) on the
    chosen MDP, alternating learning and evaluation epochs; returns per-epoch
    train and test scores."""
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn(
                'QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'
        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(
                value=1., decay_exp=.5, size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(
                value=1.5 * q_max, decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn('Bootstrapped QL available with only boot and '
                          'weighted policies!')
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=q_max - q_min,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info,
                                                **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'vpi']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    collect_qs_callback = CollectQs(agent.approximator)
    callbacks = [collect_dataset]
    if collect_qs:
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1,
                   quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)

    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)

    return train_scores, test_scores
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, file_name, out_dir,
               particles, R=1, m=1, collect_qs=False, seed=0):
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../../grid.txt', horizon=5000)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    if algorithm == 'particle-ql':
        delta = 0.1
        if policy not in ['weighted', 'vpi', 'ucb']:
            warnings.warn(
                'Particle QL available with only vpi and weighted policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                init_values=particles,
                                **algorithm_params)
        agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        if policy == 'ucb':
            q = agent.approximator
            quantiles = [i * 1. / (n_approximators - 1)
                         for i in range(n_approximators)]
            for p in range(n_approximators):
                if quantiles[p] >= 1 - delta:
                    particle_bound = p
                    break

            def quantile_func(state, quantile):
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)
                out = np.zeros(qs.shape[1])
                out[:] = qs[particle_bound, :]

                return out

            def mu(state):
                q_list = list()
                for i in range(n_approximators):
                    q_list.append(q.predict(state, idx=i))
                qs = np.array(q_list)

                return np.mean(qs, axis=0)

            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        algorithm_params = dict(R=R,
                                m=m,
                                **algorithm_params)
        agent = DelayedQLearning(mdp.info, **algorithm_params)
        pi = agent

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        collect_qs_callback = CollectQs(agent.approximator)
        callbacks += [collect_qs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1,
                   quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Evaluation: ', scores)
        test_scores.append(scores)

    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)

    return train_scores, test_scores
def experiment(algorithm, name, update_mode, update_type, policy,
               n_approximators, q_max, q_min, lr_exp, R, log_lr, r_max_m,
               delayed_m, delayed_epsilon, delta, debug, double, regret_test,
               a, b, mbie_C, value_iterations, tolerance, file_name, out_dir,
               collect_qs, seed):
    """Run one of several tabular exploration algorithms (ql, boot-ql,
    particle-ql, r-max, mbie, delayed-ql, gaussian-ql) on the chosen MDP,
    alternating learning and evaluation epochs; returns per-epoch train and
    test scores."""
    set_global_seeds(seed)
    print('Using seed %s' % seed)

    # MDP
    if name == 'Taxi':
        mdp = generate_taxi('../grid.txt', horizon=5000, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 5000
    elif name == 'Chain':
        mdp = generate_chain(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'Gridworld':
        mdp = generate_gridworld(horizon=100, gamma=0.99)
        max_steps = 500000
        evaluation_frequency = 5000
        test_samples = 1000
    elif name == 'Loop':
        mdp = generate_loop(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'RiverSwim':
        mdp = generate_river(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.4
    elif name == 'SixArms':
        mdp = generate_arms(horizon=100, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
        mbie_C = 0.8
    elif name == 'ThreeArms':
        horizon = 100
        mdp = generate_three_arms(horizon=horizon, gamma=0.99)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    elif name == 'KnightQuest':
        mdp = None
        try:
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        except:
            register(
                id='KnightQuest-v0',
                entry_point='envs.knight_quest:KnightQuest',
            )
            mdp = Gym('KnightQuest-v0', gamma=0.99, horizon=10000)
        max_steps = 100000
        evaluation_frequency = 1000
        test_samples = 1000
    else:
        raise NotImplementedError

    epsilon_test = Parameter(0)

    # Agent
    learning_rate = ExponentialDecayParameter(value=1., decay_exp=lr_exp,
                                              size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)

    if regret_test:
        max_steps = int(args.max_steps_regret * 1e6)
        evaluation_frequency = max_steps // 100
        test_samples = 1000
    if debug:
        max_steps = 100000
        evaluation_frequency = max_steps // 100
        test_samples = 1000

    if algorithm == 'ql':
        if policy not in ['boltzmann', 'eps-greedy']:
            warnings.warn(
                'QL available with only boltzmann and eps-greedy policies!')
            policy = 'eps-greedy'
        if policy == 'eps-greedy':
            epsilon_train = ExponentialDecayParameter(
                value=1., decay_exp=lr_exp,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](epsilon=epsilon_train)
        else:
            beta_train = ExponentialDecayParameter(
                value=1.5 * q_max, decay_exp=.5,
                size=mdp.info.observation_space.size)
            pi = policy_dict[policy](beta=beta_train)
        if double:
            agent = DoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = QLearning(pi, mdp.info, **algorithm_params)
    elif algorithm == 'boot-ql':
        if policy not in ['boot', 'weighted']:
            warnings.warn('Bootstrapped QL available with only boot and '
                          'weighted policies!')
            policy = 'boot'
        pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                mu=(q_max + q_min) / 2,
                                sigma=(q_max - q_min) / 2,
                                **algorithm_params)
        if double:
            agent = BootstrappedDoubleQLearning(pi, mdp.info,
                                                **algorithm_params)
        else:
            agent = BootstrappedQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'particle-ql':
        if policy not in ['weighted', 'ucb']:
            warnings.warn(
                'Particle QL available with only ucb and weighted policies!')
            policy = 'weighted'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy](n_approximators=n_approximators)
        algorithm_params = dict(n_approximators=n_approximators,
                                update_mode=update_mode,
                                update_type=update_type,
                                q_max=q_max,
                                q_min=q_min,
                                delta=delta,
                                **algorithm_params)
        if double:
            agent = ParticleDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = ParticleQLearning(pi, mdp.info, **algorithm_params)
        epsilon_train = Parameter(0)
    elif algorithm == 'r-max':
        thr_1 = int(np.ceil(
            (4 * mdp.info.size[0] * 1.0 / (1 - mdp.info.gamma) * R) ** 3))
        algorithm_params = dict(
            rmax=R,
            s_a_threshold=r_max_m
        )
        agent = RMaxAgent(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'mbie':
        algorithm_params = dict(
            rmax=R,
            C=mbie_C,
            value_iterations=value_iterations,
            tolerance=tolerance
        )
        agent = MBIE_EB(mdp.info, **algorithm_params)
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'delayed-ql':
        theoretic_m = delayed_m
        if regret_test:
            gamma = mdp.info.gamma
            Vmax = R / (1 - gamma)
            epsilon = args.delayed_ratio * Vmax
            delayed_epsilon = epsilon * (1 - gamma)
            delta = 0.1
            S, A = mdp.info.size
            theoretic_m = (1 + gamma * Vmax) ** 2 / \
                (2 * delayed_epsilon ** 2) * \
                np.log(3 * S * A / delta *
                       (1 + S * A / (delayed_epsilon * (1 - gamma))))
            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Vmax:{}".format(Vmax))
                print("Gamma:{}".format(mdp.info.gamma))
                print("Epsilon:{}".format(epsilon))
                # print("k:{}".format(k))
                print("m:{}".format(theoretic_m))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()

            def evaluate_policy(P, R, policy):
                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)

                for s in range(S):
                    for s1 in range(S):
                        P_pi[s, s1] = np.sum(policy[s, :] * P[s, :, s1])
                    R_pi[s] = np.sum(policy[s, :] *
                                     np.sum(P[s, :, :] * R[s, :, :], axis=-1))
                I = np.diag(np.ones(S))
                V = np.linalg.solve(I - gamma * P_pi, R_pi)

                return V

        algorithm_params = dict(
            R=R,
            m=theoretic_m,
            delta=delta,
            epsilon=delayed_epsilon,
            **algorithm_params)
        agent = DelayedQLearning(mdp.info, **algorithm_params)
        if regret_test:
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy,
                                            args.freq_collection)
            if debug:
                print("Q:")
                print(agent.get_approximator()[:, :])
                print("Policy:")
                print(agent.get_policy())
                print("V:{}".format(
                    evaluate_policy(mdp.p, mdp.r, agent.get_policy())))
                input()
        pi = agent
        epsilon_train = Parameter(0)
    elif algorithm == 'gaussian-ql':
        if policy not in ['weighted-gaussian', 'ucb']:
            warnings.warn(
                'Particle QL available with only ucb and weighted policies!')
            policy = 'weighted-gaussian'
        if policy == 'ucb':
            pi = UCBPolicy(delta=delta, q_max=R / (1 - mdp.info.gamma))
        else:
            pi = policy_dict[policy]()
        q_0 = (q_max - q_min) / 2
        sigma_0 = (q_max - q_min) / np.sqrt(12)
        C = 2 * R / (np.sqrt(2 * np.pi) * (1 - mdp.info.gamma) * sigma_0)
        sigma_lr = None
        if log_lr:
            sigma_lr = LogarithmicDecayParameter(value=1., C=C,
                                                 size=mdp.info.size)
        init_values = (q_0, sigma_0)
        if regret_test:
            sigma_lr = None
            gamma = mdp.info.gamma
            T = max_steps
            S, A = mdp.info.size
            a = (2 + gamma) / (2 * (1 - gamma))
            b = a - 1
            c = 1
            d = b
            q_max = R / (1 - gamma)
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)
            # first_fac = np.sqrt(b + T)
            # second_fac = np.sqrt(a * np.log(S*A*T / delta))
            # sigma2_factor = min(np.sqrt(b + T),
            #                     np.sqrt(a * np.log(S*A*T / delta)))
            q_0 = q_max
            sigma1_0 = 0
            # sigma2_0 = (R + gamma * q_max) / (standard_bound *
            #                                   np.sqrt(c-1)) * sigma2_factor
            sigma2_0 = (gamma * q_max) / (c * standard_bound) * \
                np.sqrt(a * np.log(S * A * T / delta))
            init_values = (q_0, sigma1_0, sigma2_0)
            learning_rate = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                 size=mdp.info.size)
            learning_rate_sigma1 = TheoreticalParameter(a=a, b=b, decay_exp=1,
                                                        size=mdp.info.size)
            algorithm_params = dict(
                learning_rate=learning_rate,
                sigma_1_learning_rate=learning_rate_sigma1)
            sigma_lr = BetaParameter(c=c, d=d, size=mdp.info.size)

            def evaluate_policy(P, R, policy):
                P_pi = np.zeros((S, S))
                R_pi = np.zeros(S)

                for s in range(S):
                    for s1 in range(S):
                        P_pi[s, s1] = np.sum(policy[s, :] * P[s, :, s1])
                    R_pi[s] = np.sum(policy[s, :] *
                                     np.sum(P[s, :, :] * R[s, :, :], axis=-1))
                I = np.diag(np.ones(S))
                V = np.linalg.solve(I - gamma * P_pi, R_pi)

                return V

            if debug:
                print("Delta:{}".format(delta))
                print("R:{}".format(R))
                print("Gamma:{}".format(mdp.info.gamma))
                print("mu0:{}".format(q_0))
                print("Sigma1_0:{}".format(sigma1_0))
                print("Sigma2_0:{}".format(sigma2_0))
                print("a:{}".format(a))
                print("b:{}".format(b))
                print("c:{}".format(c))
                print("d:{}".format(d))
                print("T:{}".format(T))
                print("S:{}".format(S))
                print("A:{}".format(A))
                input()
        algorithm_params = dict(
            update_mode=update_mode,
            update_type=update_type,
            sigma_learning_rate=sigma_lr,
            init_values=init_values,
            delta=delta,
            q_max=q_max,
            **algorithm_params)
        if double and not regret_test:
            agent = GaussianDoubleQLearning(pi, mdp.info, **algorithm_params)
        else:
            agent = GaussianQLearning(pi, mdp.info, **algorithm_params)
        if regret_test:
            if debug:
                freq = 10
            else:
                freq = args.freq_collection
            collect_vs_callback = CollectVs(mdp, agent, evaluate_policy, freq)
            if debug:
                print("Policy:")
                print(agent.get_policy())
                print("Q")
                for state in range(S):
                    means = np.array(agent.approximator.predict(
                        np.array([state]), idx=0))
                    sigmas1 = np.array(agent.approximator.predict(
                        np.array([state]), idx=1))
                    sigmas2 = np.array(agent.approximator.predict(
                        np.array([state]), idx=2))
                    print("Means:{}".format(means))
                    print("Sigmas1:{}".format(sigmas1))
                    print("Sigmas2:{}".format(sigmas2))
                print("V:{}".format(
                    evaluate_policy(mdp.p, mdp.r, agent.get_policy())))
                input()
        if policy == 'ucb':
            q = agent.approximator
            standard_bound = norm.ppf(1 - delta, loc=0, scale=1)

            def quantile_func(state):
                means = np.array(q.predict(state, idx=0))
                if regret_test:
                    sigmas1 = np.array(q.predict(state, idx=1))
                    sigmas2 = np.array(q.predict(state, idx=2))
                    sigmas = sigmas2
                    # print(sigmas1, sigmas2)
                else:
                    sigmas = np.array(q.predict(state, idx=1))
                out = sigmas * standard_bound + means

                return out

            def mu(state):
                q_list = q.predict(state, idx=0)
                means = np.array(q_list)

                return means

            pi.set_quantile_func(quantile_func)
            pi.set_mu(mu)
        epsilon_train = Parameter(0)
    else:
        raise ValueError()

    # Algorithm
    collect_dataset = CollectDataset()
    callbacks = [collect_dataset]
    if collect_qs:
        if algorithm not in ['r-max']:
            collect_qs_callback = CollectQs(agent.approximator)
            callbacks += [collect_qs_callback]
    if regret_test:
        callbacks += [collect_vs_callback]
    core = Core(agent, mdp, callbacks)

    train_scores = []
    test_scores = []

    for n_epoch in range(1, max_steps // evaluation_frequency + 1):

        # Train
        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_train)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(False)
        if regret_test:
            collect_vs_callback.on()
        core.learn(n_steps=evaluation_frequency, n_steps_per_fit=1,
                   quiet=True)
        dataset = collect_dataset.get()
        scores = compute_scores(dataset, mdp.info.gamma)
        # print('Train: ', scores)
        train_scores.append(scores)

        collect_dataset.clean()
        mdp.reset()
        if regret_test:
            vs = collect_vs_callback.get_values()
            if not os.path.exists(out_dir):
                os.makedirs(out_dir)
            print("Finished {} steps.".format(n_epoch * evaluation_frequency))
            np.save(out_dir + "/vs_" + algorithm + "_" + str(seed), vs)
            np.save(out_dir + "/scores_online" + str(seed), train_scores)
            collect_vs_callback.off()

        if hasattr(pi, 'set_epsilon'):
            pi.set_epsilon(epsilon_test)
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        dataset = core.evaluate(n_steps=test_samples, quiet=True)
        s = mdp.reset()
        scores = compute_scores(dataset, mdp.info.gamma)
        print('Evaluation #%d:%s ' % (n_epoch, scores))
        if debug:
            print("Policy:")
            print(agent.get_policy())
            print("Q")
            for state in range(S):
                means = np.array(agent.approximator.predict(
                    np.array([state]), idx=0))
                sigmas1 = np.array(agent.approximator.predict(
                    np.array([state]), idx=1))
                sigmas2 = np.array(agent.approximator.predict(
                    np.array([state]), idx=2))
                print("Means:{}".format(means))
                print("Sigmas1:{}".format(sigmas1))
                print("Sigmas2:{}".format(sigmas2))
            print("V:{}".format(
                evaluate_policy(mdp.p, mdp.r, agent.get_policy())))
            input()
        test_scores.append(scores)
        if regret_test:
            np.save(out_dir + "/scores_offline" + str(seed), test_scores)

    if collect_qs:
        qs = collect_qs_callback.get_values()
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        np.save(out_dir + '/' + file_name, qs)

    return train_scores, test_scores