def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(COPDAC_Q.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + COPDAC_Q.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 10
    alpha_theta = Parameter(5e-3 / n_tilings)
    alpha_omega = Parameter(0.5 / n_tilings)
    alpha_v = Parameter(0.5 / n_tilings)
    tilings = Tiles.generate(n_tilings, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    sigma = 1e-1 * np.eye(1)
    policy = GaussianPolicy(mu, sigma)

    agent = COPDAC_Q(mdp.info, policy, mu,
                     alpha_theta, alpha_omega, alpha_v,
                     value_function_features=phi,
                     policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    visualization_callback = Display(agent._V, mu,
                                     mdp.info.observation_space.low,
                                     mdp.info.observation_space.high,
                                     phi, phi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.0)
        dataset_callback.clean()
        visualization_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    sigma = 1e-8 * np.eye(1)
    policy.set_sigma(sigma)
    core.evaluate(n_steps=n_steps, render=True)
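A minimal launcher for the script above; the epoch and episode counts are illustrative assumptions, not values from the original example:

if __name__ == '__main__':
    # Assumed values: 25 epochs of 5 episodes each
    experiment(n_epochs=25, n_episodes=5)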
def experiment(n_epochs, n_iterations, ep_per_run, save_states_to_disk):
    np.random.seed()

    logger = Logger('plot_and_norm_example', results_dir=None)
    logger.strong_line()
    logger.info('Plotting and normalization example')

    # MDP
    mdp = LQR.generate(dimensions=2, max_pos=10., max_action=5.,
                       episodic=True)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 2 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=.01)
    algorithm_params = dict(optimizer=optimizer)
    agent = REINFORCE(mdp.info, policy, **algorithm_params)

    # Normalization callback
    prepro = MinMaxPreprocessor(mdp_info=mdp.info)

    # Plotting callback
    plotter = PlotDataset(mdp.info, obs_normalized=True)

    # Train
    core = Core(agent, mdp, callback_step=plotter, preprocessors=[prepro])

    # Training loop
    for n in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset = core.evaluate(n_episodes=ep_per_run, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        logger.epoch_info(n + 1, J=J)

    if save_states_to_disk:
        # Save the normalization and plotting states to disk
        logger.info('Saving plotting and normalization data')
        os.makedirs("./logs/plot_and_norm", exist_ok=True)
        prepro.save("./logs/plot_and_norm/preprocessor.msh")
        plotter.save_state("./logs/plot_and_norm/plotting_state")

        # Load the states back from disk
        logger.info('Loading preprocessor and plotter')
        prepro = MinMaxPreprocessor.load(
            "./logs/plot_and_norm/preprocessor.msh")
        plotter.load_state("./logs/plot_and_norm/plotting_state")
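A hedged launcher for the plotting/normalization example, with assumed counts and the save flag enabled so the save/load round trip at the end is exercised:

if __name__ == '__main__':
    # Assumed values; set save_states_to_disk=False to skip the disk round trip
    experiment(n_epochs=10, n_iterations=10, ep_per_run=25,
               save_states_to_disk=True)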
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = GraspEnv()

    # Policy: a diagonal Gaussian distribution over the six policy weights
    n_weights = 6
    mu = np.array([-0.5, 0.0, 0.91, m.pi, 0, 0])
    sigma = np.asarray([0.05, 0.05, 0.05, 0.1, 0.1, 0.1])
    # A wider alternative: np.asarray([0.15, 0.15, 0.15, 0.4, 0.4, 0.4])
    policy = Own_policy()
    dist = GaussianDiagonalDistribution(mu, sigma)
    agent = alg(mdp.info, dist, policy, **params)

    # Train, collecting the fit dataset to compute J after each epoch
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in range(n_epochs):
        core.learn(n_episodes=n_episodes, n_episodes_per_fit=n_ep_per_fit)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        # Record the learning curves of the distribution parameters
        p = dist.get_parameters()
        mu_0.append(p[0])
        mu_1.append(p[1])
        mu_2.append(p[2])
        mu_3.append(p[3])
        mu_4.append(p[4])
        mu_5.append(p[5])
        avg_sigma.append(np.mean(p[n_weights:]))

        # Record the learning curve of the cumulative rewards
        logger.epoch_info(i + 1, J=np.mean(J),
                          mu=p[:n_weights], sigma=p[n_weights:])
        list_J.append(np.mean(J))
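experiment() above appends to module-level lists that the snippet never defines; a sketch of those missing globals plus a launcher follows. The algorithm choice (REPS) and every numeric value are assumptions:

# Hypothetical module-level containers assumed by experiment()
mu_0, mu_1, mu_2, mu_3, mu_4, mu_5 = [], [], [], [], [], []
avg_sigma = []
list_J = []

if __name__ == '__main__':
    # Any distribution-based black-box solver fits this signature
    experiment(alg=REPS, params=dict(eps=1.0),
               n_epochs=10, n_episodes=20, n_ep_per_fit=20)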
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps,
               n_steps_per_fit, n_step_test, alg_params, policy_params):
    logger = Logger(A2C.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + A2C.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 7e-4,
                                               'eps': 1e-5}},
                         loss=F.mse_loss,
                         n_features=64,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    alg_params['critic_params'] = critic_params

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    dataset = core.evaluate(n_steps=n_step_test, render=False)

    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()

    logger.epoch_info(0, J=J, R=R, entropy=E)

    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)

        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        logger.epoch_info(it + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
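A possible launcher for the A2C script. The policy and algorithm parameter dicts below are assumptions chosen to match the constructor calls above (std_0 and n_features are forwarded to GaussianTorchPolicy and its network); all numeric values are illustrative:

if __name__ == '__main__':
    policy_params = dict(std_0=1., n_features=64, use_cuda=False)
    alg_params = dict(actor_optimizer={'class': optim.RMSprop,
                                       'params': {'lr': 7e-4,
                                                  'eps': 3e-3}},
                      max_grad_norm=0.5,
                      ent_coeff=0.01)

    experiment(alg=A2C, env_id='Pendulum-v1', horizon=200, gamma=.99,
               n_epochs=10, n_steps=30000, n_steps_per_fit=5,
               n_step_test=5000, alg_params=alg_params,
               policy_params=policy_params)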
def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles,
                             low=low, high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, features=phi, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J))

    for i in range(n_epochs):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J))
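The ship-steering function is algorithm-agnostic, so a natural launcher sweeps a few black-box optimizers. The algorithm/parameter pairs below are assumptions, not values from the original script:

if __name__ == '__main__':
    algs_params = [
        (REPS, {'eps': 1.0}),
        (RWR, {'beta': 0.7}),
        (PGPE, {'optimizer': AdaptiveOptimizer(eps=1.5)}),
    ]

    for alg, params in algs_params:
        experiment(alg, params, n_epochs=25, fit_per_epoch=10,
                   ep_per_fit=20)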
def experiment(alg, params, n_epochs, n_episodes, n_ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = Segway()

    # Policy
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    n_weights = approximator.weights_size
    mu = np.zeros(n_weights)
    sigma = 2e-0 * np.ones(n_weights)
    policy = DeterministicPolicy(approximator)
    dist = GaussianDiagonalDistribution(mu, sigma)

    agent = alg(mdp.info, dist, policy, **params)

    # Train
    dataset_callback = CollectDataset()
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes,
                   n_episodes_per_fit=n_ep_per_fit, render=False)
        J = compute_J(dataset_callback.get(), gamma=mdp.info.gamma)
        dataset_callback.clean()

        p = dist.get_parameters()

        logger.epoch_info(i + 1, J=np.mean(J),
                          mu=p[:n_weights], sigma=p[n_weights:])

    logger.info('Press a button to visualize the segway...')
    input()
    core.evaluate(n_episodes=3, render=True)
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=2, max_action=1., max_pos=1.)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    sigma = Regressor(LinearApproximator,
                      input_shape=mdp.info.observation_space.shape,
                      output_shape=mdp.info.action_space.shape)

    sigma_weights = 0.25 * np.ones(sigma.weights_size)
    sigma.set_weights(sigma_weights)

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    optimizer = AdaptiveOptimizer(eps=1e-2)
    algorithm_params = dict(optimizer=optimizer)
    agent = alg(mdp.info, policy, **algorithm_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J),
                      policy_weights=policy.get_weights().tolist())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(i + 1, J=np.mean(J),
                          policy_weights=policy.get_weights().tolist())
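Since the LQR function takes the algorithm as an argument, a natural launcher compares several policy-gradient methods. REINFORCE, GPOMDP and eNAC are assumed to be importable and to share this constructor signature; the counts are illustrative:

if __name__ == '__main__':
    algs = [REINFORCE, GPOMDP, eNAC]

    for alg in algs:
        experiment(alg, n_epochs=10, n_iterations=4, ep_per_run=25)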
def experiment(goal, use_muscles, n_epochs, n_steps, n_episodes_test):
    np.random.seed(1)

    logger = Logger('SAC', results_dir=None)
    logger.strong_line()
    logger.info('Humanoid Experiment, Algorithm: SAC')

    # MDP
    gamma = 0.99
    horizon = 2000
    mdp = create_mdp(gamma, horizon, goal, use_muscles=use_muscles)

    # Agent
    agent = create_SAC_agent(mdp)

    # Normalization callback
    normalizer = MinMaxPreprocessor(mdp_info=mdp.info)

    # Plotting callback
    plotter = PlotDataset(mdp.info)

    # Algorithm (with normalization and plotting)
    core = Core(agent, mdp, callback_step=plotter,
                preprocessors=[normalizer])

    dataset = core.evaluate(n_episodes=n_episodes_test, render=True)
    J = np.mean(compute_J(dataset, gamma))
    L = int(np.round(np.mean(episodes_length(dataset))))
    logger.epoch_info(0, J=J, episode_length=L)

    # Training loop
    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=True)
        J = np.mean(compute_J(dataset, gamma))
        L = int(np.round(np.mean(episodes_length(dataset))))
        logger.epoch_info(n + 1, J=J, episode_length=L)

    logger.info('Press a button to visualize humanoid')
    input()
    core.evaluate(n_episodes=10, render=True)
def experiment(alg, params, n_epochs, fit_per_epoch, ep_per_fit):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 1e-3 * np.eye(policy.weights_size)
    distribution = GaussianCholeskyDistribution(mu, sigma)

    # Agent
    agent = alg(mdp.info, distribution, policy, **params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_fit)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    logger.epoch_info(0, J=np.mean(J),
                      distribution_parameters=distribution.get_parameters())

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=fit_per_epoch * ep_per_fit,
                   n_episodes_per_fit=ep_per_fit)
        dataset_eval = core.evaluate(n_episodes=ep_per_fit)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        logger.epoch_info(
            i + 1, J=np.mean(J),
            distribution_parameters=distribution.get_parameters())
def experiment(alg, n_epochs, n_steps, n_episodes_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    gamma = 0.99
    habitat_root_path = Habitat.root_path()
    config_file = os.path.join(
        habitat_root_path,
        'habitat_baselines/config/rearrange/rl_pick.yaml')
    base_config_file = os.path.join(habitat_root_path,
                                    'configs/tasks/rearrange/pick.yaml')
    wrapper = 'HabitatRearrangeWrapper'
    mdp = Habitat(wrapper, config_file, base_config_file, gamma=gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    critic_input_shape = actor_input_shape + mdp.info.action_space.shape
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions,
                tau, lr_alpha, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    # RUN
    dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
    s, *_ = parse_dataset(dataset)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy(s)
    logger.epoch_info(0, J=J, R=R, entropy=E)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
        s, *_ = parse_dataset(dataset)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy(s)
        logger.epoch_info(n + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize the robot')
    input()
    core.evaluate(n_episodes=5, render=True)
def experiment(alg, n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    use_cuda = torch.cuda.is_available()

    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v1', horizon, gamma)

    # Policy
    policy_class = OrnsteinUhlenbeckPolicy
    policy_params = dict(sigma=np.ones(1) * .2, theta=.15, dt=1e-2)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    batch_size = 200
    n_features = 80
    tau = .001

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_params = dict(network=ActorNetwork,
                        n_features=n_features,
                        input_shape=actor_input_shape,
                        output_shape=mdp.info.action_space.shape,
                        use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': .001}}

    critic_input_shape = (actor_input_shape[0] +
                          mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': .001}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, policy_class, policy_params,
                actor_params, actor_optimizer, critic_params,
                batch_size, initial_replay_size, max_replay_size, tau)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = np.mean(compute_J(dataset, gamma))
    R = np.mean(compute_J(dataset))
    logger.epoch_info(0, J=J, R=R)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = np.mean(compute_J(dataset, gamma))
        R = np.mean(compute_J(dataset))
        logger.epoch_info(n + 1, J=J, R=R)

    logger.info('Press a button to visualize pendulum')
    input()
    core.evaluate(n_episodes=5, render=True)
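A hedged launcher for the deterministic actor-critic script above; DDPG and TD3 are assumed to be importable and to share this constructor signature, and the counts are illustrative:

if __name__ == '__main__':
    algs = [DDPG, TD3]

    for alg in algs:
        experiment(alg=alg, n_epochs=40, n_steps=1000, n_steps_test=2000)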
# Logger used throughout this snippet (assumed setup for the tutorial)
logger = Logger('logger_tutorial', results_dir='/tmp/logs', log_console=True)

mdp = generate_simple_chain(state_n=5, goal_states=[2], prob=.8, rew=1,
                            gamma=.9)

epsilon = Parameter(value=.15)
pi = EpsGreedy(epsilon=epsilon)
agent = QLearning(mdp.info, pi, learning_rate=Parameter(value=.2))

core = Core(agent, mdp)
epochs = 10

# Initial policy evaluation
logger.info('Experiment started')
logger.strong_line()

dataset = core.evaluate(n_steps=100)
J = np.mean(compute_J(dataset, mdp.info.gamma))  # Discounted returns
R = np.mean(compute_J(dataset))  # Undiscounted returns

logger.epoch_info(0, J=J, R=R, any_label='any value')

for i in trange(epochs):
    # Here some learning
    core.learn(n_steps=100, n_steps_per_fit=1)
    sleep(0.5)
    dataset = core.evaluate(n_steps=100)
    sleep(0.5)

    J = np.mean(compute_J(dataset, mdp.info.gamma))  # Discounted returns
    R = np.mean(compute_J(dataset))  # Undiscounted returns

    # Here logging epoch results to the console
    logger.epoch_info(i + 1, J=J, R=R)

    # Logging the data in J.npy and R.npy
    logger.log_numpy(J=J, R=R)
def experiment(alg, n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    # MDP
    horizon = 200
    gamma = 0.99
    mdp = Gym('Pendulum-v1', horizon, gamma)

    # Settings
    initial_replay_size = 64
    max_replay_size = 50000
    batch_size = 64
    n_features = 64
    warmup_transitions = 100
    tau = 0.005
    lr_alpha = 3e-4

    use_cuda = torch.cuda.is_available()

    # Approximator
    actor_input_shape = mdp.info.observation_space.shape
    actor_mu_params = dict(network=ActorNetwork,
                           n_features=n_features,
                           input_shape=actor_input_shape,
                           output_shape=mdp.info.action_space.shape,
                           use_cuda=use_cuda)
    actor_sigma_params = dict(network=ActorNetwork,
                              n_features=n_features,
                              input_shape=actor_input_shape,
                              output_shape=mdp.info.action_space.shape,
                              use_cuda=use_cuda)

    actor_optimizer = {'class': optim.Adam, 'params': {'lr': 3e-4}}

    critic_input_shape = (actor_input_shape[0] +
                          mdp.info.action_space.shape[0],)
    critic_params = dict(network=CriticNetwork,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=n_features,
                         input_shape=critic_input_shape,
                         output_shape=(1,),
                         use_cuda=use_cuda)

    # Agent
    agent = alg(mdp.info, actor_mu_params, actor_sigma_params,
                actor_optimizer, critic_params, batch_size,
                initial_replay_size, max_replay_size, warmup_transitions,
                tau, lr_alpha, critic_fit_params=None)

    # Algorithm
    core = Core(agent, mdp)

    # RUN
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    s, *_ = parse_dataset(dataset)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy(s)
    logger.epoch_info(0, J=J, R=R, entropy=E)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    for n in trange(n_epochs, leave=False):
        core.learn(n_steps=n_steps, n_steps_per_fit=1)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        s, *_ = parse_dataset(dataset)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy(s)
        logger.epoch_info(n + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize pendulum')
    input()
    core.evaluate(n_episodes=5, render=True)
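The corresponding launcher for the SAC script, again with assumed counts:

if __name__ == '__main__':
    experiment(alg=SAC, n_epochs=40, n_steps=1000, n_steps_test=2000)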
def experiment(n_epochs, n_episodes):
    np.random.seed()

    logger = Logger(StochasticAC_AVG.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + StochasticAC_AVG.__name__)

    # MDP
    n_steps = 5000
    mdp = InvertedPendulum(horizon=n_steps)

    # Agent
    n_tilings = 11
    alpha_r = Parameter(.0001)
    alpha_theta = Parameter(.001 / n_tilings)
    alpha_v = Parameter(.1 / n_tilings)
    tilings = Tiles.generate(n_tilings - 1, [10, 10],
                             mdp.info.observation_space.low,
                             mdp.info.observation_space.high + 1e-3)

    phi = Features(tilings=tilings)

    tilings_v = tilings + Tiles.generate(
        1, [1, 1],
        mdp.info.observation_space.low,
        mdp.info.observation_space.high + 1e-3)
    psi = Features(tilings=tilings_v)

    input_shape = (phi.size,)

    mu = Regressor(LinearApproximator, input_shape=input_shape,
                   output_shape=mdp.info.action_space.shape)

    std = Regressor(LinearApproximator, input_shape=input_shape,
                    output_shape=mdp.info.action_space.shape)

    std_0 = np.sqrt(1.)
    std.set_weights(np.log(std_0) / n_tilings * np.ones(std.weights_size))

    policy = StateLogStdGaussianPolicy(mu, std)

    agent = StochasticAC_AVG(mdp.info, policy,
                             alpha_theta, alpha_v, alpha_r,
                             lambda_par=.5,
                             value_function_features=psi,
                             policy_features=phi)

    # Train
    dataset_callback = CollectDataset()
    display_callback = Display(agent._V, mu, std,
                               mdp.info.observation_space.low,
                               mdp.info.observation_space.high,
                               phi, psi)
    core = Core(agent, mdp, callbacks_fit=[dataset_callback])

    for i in trange(n_epochs, leave=False):
        core.learn(n_episodes=n_episodes, n_steps_per_fit=1, render=False)
        J = compute_J(dataset_callback.get(), gamma=1.)
        dataset_callback.clean()
        display_callback()
        logger.epoch_info(i + 1, R_mean=np.sum(J) / n_steps / n_episodes)

    logger.info('Press a button to visualize the pendulum...')
    input()
    core.evaluate(n_steps=n_steps, render=True)
def experiment(n_epochs, n_steps, n_steps_test):
    np.random.seed()

    logger = Logger(DQN.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + DQN.__name__)

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    epsilon = LinearParameter(value=1., threshold_value=.01, n=5000)
    epsilon_test = Parameter(value=0.)
    epsilon_random = Parameter(value=1.)
    pi = EpsGreedy(epsilon=epsilon_random)

    # Settings
    initial_replay_size = 500
    max_replay_size = 5000
    target_update_frequency = 100
    batch_size = 200
    n_features = 80
    train_frequency = 1

    # Approximator
    input_shape = mdp.info.observation_space.shape
    approximator_params = dict(network=Network,
                               optimizer={'class': optim.Adam,
                                          'params': {'lr': .001}},
                               loss=F.smooth_l1_loss,
                               n_features=n_features,
                               input_shape=input_shape,
                               output_shape=mdp.info.action_space.size,
                               n_actions=mdp.info.action_space.n)

    # Agent
    agent = DQN(mdp.info, pi, TorchApproximator,
                approximator_params=approximator_params,
                batch_size=batch_size,
                initial_replay_size=initial_replay_size,
                max_replay_size=max_replay_size,
                target_update_frequency=target_update_frequency)

    # Algorithm
    core = Core(agent, mdp)

    core.learn(n_steps=initial_replay_size,
               n_steps_per_fit=initial_replay_size)

    # RUN
    pi.set_epsilon(epsilon_test)
    dataset = core.evaluate(n_steps=n_steps_test, render=False)
    J = compute_J(dataset, gamma_eval)
    logger.epoch_info(0, J=np.mean(J))

    for n in trange(n_epochs):
        pi.set_epsilon(epsilon)
        core.learn(n_steps=n_steps, n_steps_per_fit=train_frequency)
        pi.set_epsilon(epsilon_test)
        dataset = core.evaluate(n_steps=n_steps_test, render=False)
        J = compute_J(dataset, gamma_eval)
        logger.epoch_info(n + 1, J=np.mean(J))

    logger.info('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)
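A launcher for the DQN script; the counts below are assumed values:

if __name__ == '__main__':
    experiment(n_epochs=20, n_steps=1000, n_steps_test=2000)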
def experiment(n_epochs, n_steps, n_steps_per_fit, n_step_test):
    np.random.seed()

    logger = Logger(A2C.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + A2C.__name__)

    # MDP
    horizon = 1000
    gamma = 0.99
    gamma_eval = 1.
    mdp = Gym('Acrobot-v1', horizon, gamma)

    # Policy
    policy_params = dict(n_features=32, use_cuda=False)

    beta = Parameter(1e0)
    pi = BoltzmannTorchPolicy(Network,
                              mdp.info.observation_space.shape,
                              (mdp.info.action_space.n,),
                              beta=beta,
                              **policy_params)

    # Agent
    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 1e-3,
                                               'eps': 1e-5}},
                         loss=F.mse_loss,
                         n_features=32,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    alg_params = dict(actor_optimizer={'class': optim.RMSprop,
                                       'params': {'lr': 1e-3,
                                                  'eps': 3e-3}},
                      critic_params=critic_params,
                      ent_coeff=0.01)

    agent = A2C(mdp.info, pi, **alg_params)

    # Algorithm
    core = Core(agent, mdp)
    core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)

    # RUN
    dataset = core.evaluate(n_steps=n_step_test, render=False)
    J = compute_J(dataset, gamma_eval)
    logger.epoch_info(0, J=np.mean(J))

    for n in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)
        J = compute_J(dataset, gamma_eval)
        logger.epoch_info(n + 1, J=np.mean(J))

    logger.info('Press a button to visualize acrobot')
    input()
    core.evaluate(n_episodes=5, render=True)