def test_collect_Q():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))

    eps = Parameter(0.1)
    pi = EpsGreedy(eps)
    alpha = Parameter(0.1)
    agent = SARSA(mdp.info, pi, alpha)

    callback_q = CollectQ(agent.Q)
    callback_max_q = CollectMaxQ(agent.Q, np.array([2]))

    core = Core(agent, mdp, callbacks=[callback_q, callback_max_q])

    core.learn(n_steps=1000, n_steps_per_fit=1, quiet=True)

    V_test = np.array([2.4477574, 0.02246188, 1.6210059, 6.01867052])
    V = callback_q.get()[-1]

    assert np.allclose(V[0, :], V_test)

    V_max = np.array([np.max(x[2, :], axis=-1) for x in callback_q.get()])
    max_q = np.array(callback_max_q.get())

    assert np.allclose(V_max, max_q)
def main(argv):
    # env = retro.make(game='MegaMan-Nes', obs_type=retro.Observations.RAM)
    SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
    retro.data.Integrations.add_custom_path(
        os.path.join(SCRIPT_DIR, "integrations"))
    print("megamanmain" in retro.data.list_games(
        inttype=retro.data.Integrations.ALL))

    env = RetroEnvironment('megamanmain', 5000, 0.9, obs_shape=256**3,
                           obs_type=retro.Observations.RAM,
                           use_restricted_actions=retro.Actions.DISCRETE,
                           inttype=retro.data.Integrations.ALL)

    epsilon = Parameter(value=1.)
    learning_rate = Parameter(value=0.3)

    policy = EpsGreedy(epsilon=epsilon)
    agent = QLearning(env.info, policy, learning_rate)
    # agent = CustomSARSA(env.info, policy, learning_rate)

    core = Core(agent, env)
    core.learn(n_episodes=50, n_steps_per_fit=1)
    core.evaluate(n_episodes=2, render=True)

    # print(agent.Q.shape, file=sys.stderr)
    # shape = agent.Q.shape
    # q = np.zeros(shape)
    # for i in range(shape[0]):
    #     for j in range(shape[1]):
    #         state = np.array([i])
    #         action = np.array([j])
    #         q[i, j] = agent.Q.predict(state, action)
    # print(q)

    return 0
def experiment():
    np.random.seed()

    # MDP
    mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                                gamma=.9)

    # Policy
    epsilon = Parameter(value=.15)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = Parameter(value=.2)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = QLearning(mdp.info, pi, **algorithm_params)

    # Core
    core = Core(agent, mdp)

    # Initial policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J start:', J)

    # Train
    core.learn(n_steps=10000, n_steps_per_fit=1)

    # Final policy evaluation
    dataset = core.evaluate(n_steps=1000)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    print('J final:', J)
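# Hypothetical entry point (not in the original source): a minimal sketch of
# how the chain experiment above could be run as a script, assuming the
# module-level imports (numpy, mushroom_rl) used by experiment() are present.
if __name__ == '__main__':
    experiment()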
def main(argv):
    # env = retro.make(game='MegaMan-Nes', obs_type=retro.Observations.RAM)
    env = RetroEnvironment('MegaMan-Nes', 5000, 0.9, obs_shape=256**3,
                           obs_type=retro.Observations.RAM,
                           use_restricted_actions=retro.Actions.DISCRETE)

    epsilon = Parameter(value=1.)
    learning_rate = Parameter(value=0.3)

    policy = EpsGreedy(epsilon=epsilon)
    agent = SARSA(env.info, policy, learning_rate)
    # agent = CustomSARSA(env.info, policy, learning_rate)

    core = Core(agent, env)
    core.learn(n_steps=10000, n_steps_per_fit=1)
    core.evaluate(n_episodes=10, render=True)

    '''print(agent.Q.shape, file=sys.stderr)
    shape = agent.Q.shape
    q = np.zeros(shape)
    for i in range(shape[0]):
        for j in range(shape[1]):
            state = np.array([i])
            action = np.array([j])
            q[i, j] = agent.Q.predict(state, action)
    print(q)'''

    return 0
def learn(alg, alg_params):
    mdp = CarOnHill()
    np.random.seed(1)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    agent = alg(mdp.info, pi, approximator,
                approximator_params=approximator_params, **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=5, n_episodes_per_fit=5)

    # Test
    test_epsilon = Parameter(0.75)
    agent.policy.set_epsilon(test_epsilon)
    dataset = core.evaluate(n_episodes=2)

    return agent, np.mean(compute_J(dataset, mdp.info.gamma))
def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(COPDAC_Q.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + COPDAC_Q.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info, policy, mu,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
def test_dataset_utils():
    np.random.seed(88)

    mdp = GridWorld(3, 3, (2, 2))
    epsilon = Parameter(value=0.)
    alpha = Parameter(value=0.)
    pi = EpsGreedy(epsilon=epsilon)
    agent = SARSA(mdp.info, pi, alpha)
    core = Core(agent, mdp)

    dataset = core.evaluate(n_episodes=10)

    J = compute_J(dataset, mdp.info.gamma)
    J_test = np.array([1.16106307e-03, 2.78128389e-01, 1.66771817e+00,
                       3.09031544e-01, 1.19725152e-01, 9.84770902e-01,
                       1.06111661e-02, 2.05891132e+00, 2.28767925e+00,
                       4.23911583e-01])
    assert np.allclose(J, J_test)

    L = episodes_length(dataset)
    L_test = np.array([87, 35, 18, 34, 43, 23, 66, 16, 15, 31])
    assert np.array_equal(L, L_test)

    dataset_ep = select_first_episodes(dataset, 3)
    J = compute_J(dataset_ep, mdp.info.gamma)
    assert np.allclose(J, J_test[:3])

    L = episodes_length(dataset_ep)
    assert np.allclose(L, L_test[:3])

    samples = select_random_samples(dataset, 2)
    s, a, r, ss, ab, last = parse_dataset(samples)
    s_test = np.array([[6.], [1.]])
    a_test = np.array([[0.], [1.]])
    r_test = np.zeros(2)
    ss_test = np.array([[3], [4]])
    ab_test = np.zeros(2)
    last_test = np.zeros(2)
    assert np.array_equal(s, s_test)
    assert np.array_equal(a, a_test)
    assert np.array_equal(r, r_test)
    assert np.array_equal(ss, ss_test)
    assert np.array_equal(ab, ab_test)
    assert np.array_equal(last, last_test)

    index = np.sum(L_test[:2]) + L_test[2] // 2
    min_J, max_J, mean_J, n_episodes = compute_metrics(dataset[:index],
                                                       mdp.info.gamma)
    assert min_J == 0.0011610630703530948
    assert max_J == 0.2781283894436937
    assert mean_J == 0.1396447262570234
    assert n_episodes == 2
def main(argv):
    mdp = RetroFullEnvironment('MegaMan-Nes', 5000, 0.9,
                               # obs_type=retro.Observations.RAM,
                               use_restricted_actions=retro.Actions.DISCRETE)

    epsilon = Parameter(value=1.)
    learning_rate = Parameter(value=0.3)

    train_frequency = 4
    evaluation_frequency = 250000
    target_update_frequency = 10000
    initial_replay_size = 50000
    # initial_replay_size = 500
    max_replay_size = 500000
    test_samples = 125000
    max_steps = 50000000

    policy = EpsGreedy(epsilon=epsilon)

    optimizer = {'class': Adam, 'params': dict(lr=0.00025)}

    approximator = KerasApproximator
    approximator_params = dict(network=model,
                               input_shape=mdp.info.observation_space.shape,
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n,
                               n_features=2048,
                               optimizer=optimizer,
                               loss=mean_squared_error,
                               print_summary=True)

    algorithm_params = dict(
        batch_size=32,
        target_update_frequency=target_update_frequency // train_frequency,
        replay_memory=None,
        initial_replay_size=initial_replay_size,
        max_replay_size=max_replay_size)

    agent = DQN(mdp.info, policy, approximator,
                approximator_params=approximator_params,
                **algorithm_params)

    core = Core(agent, mdp)
    # core.learn(n_steps=1000000, n_steps_per_fit=1)
    core.learn(n_steps=100000, n_steps_per_fit=1)
    core.evaluate(n_episodes=10, render=True)

    return 0
def experiment():
    np.random.seed()

    # MDP
    mdp = CarOnHill()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Approximator
    approximator_params = dict(input_shape=mdp.info.observation_space.shape,
                               n_actions=mdp.info.action_space.n,
                               n_estimators=50,
                               min_samples_split=5,
                               min_samples_leaf=2)
    approximator = ExtraTreesRegressor

    # Agent
    algorithm_params = dict(n_iterations=20)
    agent = FQI(mdp.info, pi, approximator,
                approximator_params=approximator_params, **algorithm_params)

    # Algorithm
    core = Core(agent, mdp)

    # Render
    core.evaluate(n_episodes=1, render=True)

    # Train
    core.learn(n_episodes=1000, n_episodes_per_fit=1000)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    initial_states = np.zeros((289, 2))
    cont = 0
    for i in range(-8, 9):
        for j in range(-8, 9):
            initial_states[cont, :] = [0.125 * i, 0.375 * j]
            cont += 1

    dataset = core.evaluate(initial_states=initial_states)

    # Render
    core.evaluate(n_episodes=3, render=True)

    return np.mean(compute_J(dataset, mdp.info.gamma))
def test_rq_learning():
    pi, mdp, _ = initialize()
    agent = RQLearning(mdp.info, pi, Parameter(.1), beta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0.32411217, 2.9698436, 0.46474438, 1.10269504],
                       [2.99505139, 5.217031, 0.40933461, 0.37687883],
                       [0.41942675, 0.32363486, 0., 4.68559],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)

    agent = RQLearning(mdp.info, pi, Parameter(.1), delta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[1.04081115e-2, 5.14662188e-1, 1.73951634e-2,
                        1.24081875e-01],
                       [0., 2.71, 1.73137500e-4, 4.10062500e-6],
                       [0., 4.50000000e-2, 0., 4.68559],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)

    agent = RQLearning(mdp.info, pi, Parameter(.1), off_policy=True,
                       beta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[3.55204022, 4.54235939, 3.42601165, 2.95170908],
                       [2.73877031, 3.439, 2.42031528, 2.86634531],
                       [3.43274708, 3.8592342, 3.72637395, 5.217031],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)

    agent = RQLearning(mdp.info, pi, Parameter(.1), off_policy=True,
                       delta=Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[0.18947806, 1.57782254, 0.21911489, 1.05197011],
                       [0.82309759, 5.217031, 0.04167492, 0.61472604],
                       [0.23620541, 0.59828262, 1.25299991, 5.217031],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def test_r_learning():
    pi, mdp, _ = initialize()
    agent = RLearning(mdp.info, pi, Parameter(.1), Parameter(.5))

    core = Core(agent, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_q = np.array([[-6.19137991, -3.9368055, -5.11544257, -3.43673781],
                       [-2.52319391, 1.92201829, -2.77602918, -2.45972955],
                       [-5.38824415, -2.43019918, -1.09965936, 2.04202511],
                       [0., 0., 0., 0.]])

    assert np.allclose(agent.Q.table, test_q)
def experiment(algorithm_class, exp):
    np.random.seed()

    # MDP
    p = np.load('chain_structure/p.npy')
    rew = np.load('chain_structure/rew.npy')
    mdp = FiniteMDP(p, rew, gamma=.9)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    learning_rate = ExponentialParameter(value=1., exp=exp,
                                         size=mdp.info.size)
    algorithm_params = dict(learning_rate=learning_rate)
    agent = algorithm_class(mdp.info, pi, **algorithm_params)

    # Algorithm
    collect_Q = CollectQ(agent.approximator)
    callbacks = [collect_Q]
    core = Core(agent, mdp, callbacks)

    # Train
    core.learn(n_steps=20000, n_steps_per_fit=1, quiet=True)

    Qs = collect_Q.get()

    return Qs
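# Hypothetical usage sketch (an assumption, not part of the original file):
# the experiment above is parametrized by a TD algorithm class and a decay
# exponent for the ExponentialParameter, so it could be swept like this.
# QLearning is used elsewhere in this code base; any other value-based
# mushroom_rl algorithm with the same constructor signature would also work.
if __name__ == '__main__':
    for algorithm_class in (QLearning,):
        for exp in (.51, 1.):
            Qs = experiment(algorithm_class, exp)
            print(algorithm_class.__name__, exp, Qs[-1])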
def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n)
    agent = TrueOnlineSARSALambda(mdp_continuous.info, pi,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.30427303, 0., -13.54157504, 0., -16.82373134, 0.,
                       -10.29613337, 0., -14.79470382, 0., -10.50654665, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n)
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, LinearApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.62627886, 0., -13.03033079, 0., -15.93237930, 0.,
                       -9.72299176, 0., -13.78884631, 0., -9.92157645, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
def test_boltzmann():
    np.random.seed(88)
    beta = Parameter(0.1)
    pi = Boltzmann(beta)

    Q = Table((10, 3))
    Q.table = np.random.randn(10, 3)
    pi.set_q(Q)

    s = np.array([2])
    a = np.array([1])

    p_s = pi(s)
    p_s_test = np.array([0.30676679, 0.36223227, 0.33100094])
    assert np.allclose(p_s, p_s_test)

    p_sa = pi(s, a)
    p_sa_test = np.array([0.36223227])
    assert np.allclose(p_sa, p_sa_test)

    a = pi.draw_action(s)
    a_test = 2
    assert a.item() == a_test

    beta_2 = LinearParameter(0.2, 0.1, 2)
    pi.set_beta(beta_2)
    p_sa_2 = pi(s, a)
    assert p_sa_2 < p_sa

    pi.update(s, a)
    p_sa_3 = pi(s, a)
    p_sa_3_test = np.array([0.33100094])
    assert np.allclose(p_sa_3, p_sa_3_test)
def test_eps_greedy():
    np.random.seed(88)
    eps = Parameter(0.1)
    pi = EpsGreedy(eps)

    Q = Table((10, 3))
    Q.table = np.random.randn(10, 3)
    pi.set_q(Q)

    s = np.array([2])
    a = np.array([1])

    p_s = pi(s)
    p_s_test = np.array([0.03333333, 0.93333333, 0.03333333])
    assert np.allclose(p_s, p_s_test)

    p_sa = pi(s, a)
    p_sa_test = np.array([0.93333333])
    assert np.allclose(p_sa, p_sa_test)

    a = pi.draw_action(s)
    a_test = 1
    assert a.item() == a_test

    eps_2 = LinearParameter(0.2, 0.1, 2)
    pi.set_epsilon(eps_2)
    p_sa_2 = pi(s, a)
    assert p_sa_2 < p_sa

    pi.update(s, a)
    pi.update(s, a)
    p_sa_3 = pi(s, a)
    print(eps_2.get_value())
    assert p_sa_3 == p_sa
def test_true_online_sarsa_lambda_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(
        datetime.now().strftime("%H%M%S%f"))

    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n)
    agent_save = TrueOnlineSARSALambda(mdp_continuous.info, pi,
                                       Parameter(.1), .9, features=features,
                                       approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
def test_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info, pi, approximator_params=approximator_params,
                 fit_params=fit_params, features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    w = agent.approximator.get_weights()
    w_test = np.array([-1.00749128, -1.13444655, -0.96620322])

    assert np.allclose(w, w_test)
def default(cls, lr=.0001, network=DQNNetwork, initial_replay_size=50000,
            max_replay_size=1000000, batch_size=32,
            target_update_frequency=2500, n_steps_per_fit=1, use_cuda=False):
    policy = EpsGreedy(epsilon=Parameter(value=1.))

    approximator_params = dict(network=network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': lr}},
                               loss=F.smooth_l1_loss,
                               use_cuda=use_cuda)

    alg_params = dict(initial_replay_size=initial_replay_size,
                      max_replay_size=max_replay_size,
                      batch_size=batch_size,
                      target_update_frequency=target_update_frequency)

    return cls(policy, TorchApproximator, approximator_params, alg_params,
               n_steps_per_fit)
def test_sarsa_lambda_continuous_nn():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0])

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n)
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, TorchApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-0.18968964, 0.4296857, 0.52967095, 0.5674884,
                       -0.12784956, -0.10572472, -0.14546978, -0.67001086,
                       -0.93925357])

    assert np.allclose(agent.Q.get_weights(), test_w)
def test_true_online_sarsa_lambda():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n)
    agent = TrueOnlineSARSALambda(mdp_continuous.info, pi,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-17.27410736, 0., -15.04386343, 0., -16.6551805, 0.,
                       -11.31383707, 0., -16.11782002, 0., -9.6927357, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
def learn_lspi():
    np.random.seed(1)

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info, pi, approximator_params=approximator_params,
                 fit_params=fit_params, features=features)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_episodes_per_fit=10)

    return agent
def test_sarsa_lambda_continuous_linear():
    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    n_tilings = 1
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp_continuous.info.observation_space.low,
                             mdp_continuous.info.observation_space.high)
    features = Features(tilings=tilings)

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        n_actions=mdp_continuous.info.action_space.n)
    agent = SARSALambdaContinuous(mdp_continuous.info, pi, LinearApproximator,
                                  Parameter(.1), .9, features=features,
                                  approximator_params=approximator_params)

    core = Core(agent, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    test_w = np.array([-16.38428419, 0., -14.31250136, 0., -15.68571525, 0.,
                       -10.15663821, 0., -15.0545445, 0., -8.3683605, 0.])

    assert np.allclose(agent.Q.get_weights(), test_w)
def test_sarsa_lambda_continuous_nn_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(
        datetime.now().strftime("%H%M%S%f"))

    pi, _, mdp_continuous = initialize()
    mdp_continuous.seed(1)

    features = Features(
        n_outputs=mdp_continuous.info.observation_space.shape[0])

    approximator_params = dict(
        input_shape=(features.size,),
        output_shape=(mdp_continuous.info.action_space.n,),
        network=Network,
        n_actions=mdp_continuous.info.action_space.n)
    agent_save = SARSALambdaContinuous(mdp_continuous.info, pi,
                                       TorchApproximator,
                                       Parameter(.1), .9, features=features,
                                       approximator_params=approximator_params)

    core = Core(agent_save, mdp_continuous)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
def experiment():
    np.random.seed()

    # MDP
    mdp = CartPole()

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    basis = [PolynomialBasis()]

    s1 = np.array([-np.pi, 0, np.pi]) * .25
    s2 = np.array([-1, 0, 1])
    for i in s1:
        for j in s2:
            basis.append(GaussianRBF(np.array([i, j]), np.array([1.])))
    features = Features(basis_list=basis)

    fit_params = dict()
    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    agent = LSPI(mdp.info, pi, approximator_params=approximator_params,
                 fit_params=fit_params, features=features)

    # Algorithm
    core = Core(agent, mdp)
    core.evaluate(n_episodes=3, render=True)

    # Train
    core.learn(n_episodes=100, n_episodes_per_fit=100)

    # Test
    test_epsilon = Parameter(0.)
    agent.policy.set_epsilon(test_epsilon)

    dataset = core.evaluate(n_episodes=1, quiet=True)

    core.evaluate(n_steps=100, render=True)

    return np.mean(episodes_length(dataset))
def test_copdac_q():
    n_steps = 50
    mdp = InvertedPendulum(horizon=n_steps)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Agent
    n_tilings = 1
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [2, 2],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info, policy, mu,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    core = Core(agent, mdp)
    core.learn(n_episodes=2, n_episodes_per_fit=1)

    w = agent.policy.get_weights()
    w_test = np.array([0, -6.62180045e-7, 0, -4.23972882e-2])

    assert np.allclose(w, w_test)
def experiment(alpha):
    np.random.seed()

    # MDP
    mdp = Gym(name='Acrobot-v1', horizon=np.inf, gamma=1.)

    # Policy
    epsilon = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon)

    # Agent
    n_tilings = 10
    tilings = Tiles.generate(n_tilings, [10, 10, 10, 10, 10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high)
    features = Features(tilings=tilings)

    learning_rate = Parameter(alpha / n_tilings)

    approximator_params = dict(input_shape=(features.size,),
                               output_shape=(mdp.info.action_space.n,),
                               n_actions=mdp.info.action_space.n)
    algorithm_params = {'learning_rate': learning_rate,
                        'lambda_coeff': .9}
    agent = TrueOnlineSARSALambda(mdp.info, pi,
                                  approximator_params=approximator_params,
                                  features=features, **algorithm_params)
    # shape = agent.approximator.Q
    # print(agent.Q)

    # Algorithm
    core = Core(agent, mdp)

    # Train
    core.learn(n_episodes=10, n_steps_per_fit=1, render=True)
    dataset = core.evaluate(n_episodes=1, render=False)
    # print(dataset)
    print(episodes_length(dataset))

    return np.mean(compute_J(dataset, .96))
def test_rq_learning_save(tmpdir):
    agent_path = tmpdir / 'agent_{}'.format(
        datetime.now().strftime("%H%M%S%f"))

    pi, mdp, _ = initialize()
    agent_save = RQLearning(mdp.info, pi, Parameter(.1), beta=Parameter(.5))

    core = Core(agent_save, mdp)

    # Train
    core.learn(n_steps=100, n_steps_per_fit=1, quiet=True)

    agent_save.save(agent_path)
    agent_load = Agent.load(agent_path)

    for att, method in vars(agent_save).items():
        save_attr = getattr(agent_save, att)
        load_attr = getattr(agent_load, att)

        tu.assert_eq(save_attr, load_attr)
def build(self, mdp_info):
    self.approximator_params['input_shape'] = \
        mdp_info.observation_space.shape
    self.approximator_params['output_shape'] = (mdp_info.action_space.n,)
    self.approximator_params['n_actions'] = mdp_info.action_space.n

    self.epsilon = LinearParameter(value=1, threshold_value=.05, n=1000000)
    self.epsilon_test = Parameter(value=.01)

    return DoubleDQN(mdp_info, self.policy, self.approximator,
                     self.approximator_params, **self.alg_params)
def learn(alg, alg_params):
    # MDP
    mdp = CartPole()
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    # Policy
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(
        network=Network if alg is not CategoricalDQN else FeatureNetwork,
        optimizer={'class': optim.Adam,
                   'params': {'lr': .001}},
        loss=F.smooth_l1_loss,
        input_shape=input_shape,
        output_shape=mdp.info.action_space.size,
        n_actions=mdp.info.action_space.n,
        n_features=2,
        use_cuda=False)

    # Agent
    if alg not in [DuelingDQN, CategoricalDQN]:
        agent = alg(mdp.info, pi, TorchApproximator,
                    approximator_params=approximator_params, **alg_params)
    elif alg is CategoricalDQN:
        agent = alg(mdp.info, pi, approximator_params=approximator_params,
                    n_atoms=2, v_min=-1, v_max=1, **alg_params)
    else:
        agent = alg(mdp.info, pi, approximator_params=approximator_params,
                    **alg_params)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=500, n_steps_per_fit=5)

    return agent