def run_experiment(variant, reporter):
    env = get_environment('gym', 'MultiGoal', 'Default', {
        'actuation_cost_coeff': 1,
        'distance_cost_coeff': 0.1,
        'goal_reward': 1,
        'init_sigma': 0.1,
    })

    pool = SimpleReplayPool(
        observation_space=env.observation_space,
        action_space=env.action_space,
        max_size=1e6)

    sampler = SimpleSampler(
        max_path_length=30, min_pool_size=100, batch_size=64)

    Qs = get_Q_function_from_variant(variant, env)
    policy = get_policy_from_variant(variant, env, Qs)

    plotter = QFPolicyPlotter(
        Q=Qs[0],
        policy=policy,
        obs_lst=np.array(((-2.5, 0.0),
                          (0.0, 0.0),
                          (2.5, 2.5),
                          (-2.5, -2.5))),
        default_action=(np.nan, np.nan),
        n_samples=100)

    algorithm = SAC(
        sampler=sampler,
        reparameterize=True,
        epoch_length=100,
        n_epochs=1000,
        n_train_repeat=1,
        eval_render_mode=None,
        eval_n_episodes=10,
        eval_deterministic=False,
        env=env,
        policy=policy,
        initial_exploration_policy=None,
        pool=pool,
        Qs=Qs,
        plotter=plotter,
        lr=3e-4,
        target_entropy=-2.0,
        discount=0.99,
        tau=1e-4,
        save_full_state=True,
    )

    initialize_tf_variables(algorithm._session, only_uninitialized=True)

    for train_result in algorithm.train():
        reporter(**train_result)

def run_experiment(variant, reporter):
    # 2D MultiGoal point-mass environment; evaluation runs on a copy.
    training_environment = get_environment(
        'gym', 'MultiGoal', 'Default-v0', {
            'actuation_cost_coeff': 30,
            'distance_cost_coeff': 1,
            'goal_reward': 10,
            'init_sigma': 0.1,
        })
    evaluation_environment = training_environment.copy()

    pool = SimpleReplayPool(
        environment=training_environment, max_size=1e6)
    sampler = SimpleSampler(max_path_length=30)

    # Fill in the environment-dependent shapes before instantiating the
    # Q-functions and the policy from their variant specs.
    variant['Q_params']['config'].update({
        'input_shapes': (
            training_environment.observation_shape,
            training_environment.action_shape,
        ),
    })
    Qs = value_functions.get(variant['Q_params'])

    variant['policy_params']['config'].update({
        'action_range': (training_environment.action_space.low,
                         training_environment.action_space.high),
        'input_shapes': training_environment.observation_shape,
        'output_shape': training_environment.action_shape,
    })
    policy = policies.get(variant['policy_params'])

    # Plot Q-values and policy samples at four fixed observations.
    plotter = QFPolicyPlotter(
        Q=Qs[0],
        policy=policy,
        obs_lst=np.array(((-2.5, 0.0),
                          (0.0, 0.0),
                          (2.5, 2.5),
                          (-2.5, -2.5))),
        default_action=(np.nan, np.nan),
        n_samples=100)

    # Hand the constructed components to the algorithm via its variant config.
    variant['algorithm_params']['config'].update({
        'training_environment': training_environment,
        'evaluation_environment': evaluation_environment,
        'policy': policy,
        'Qs': Qs,
        'pool': pool,
        'sampler': sampler,
        'min_pool_size': 100,
        'batch_size': 64,
        'plotter': plotter,
    })
    algorithm = algorithms.get(variant['algorithm_params'])

    for train_result in algorithm.train():
        reporter(**train_result)

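# Hypothetical sketch (not from the original example) of the variant layout this
# version of run_experiment expects: each *_params entry carries a 'config' dict
# that the function fills in with environment-dependent shapes before handing it
# to value_functions.get / policies.get / algorithms.get. The 'class_name' key
# and every concrete value below are assumptions for illustration only.
EXAMPLE_VARIANT = {
    'Q_params': {
        'class_name': 'double_feedforward_Q_function',  # assumed identifier
        'config': {'hidden_layer_sizes': (256, 256)},    # assumed hyperparameters
    },
    'policy_params': {
        'class_name': 'FeedforwardGaussianPolicy',       # assumed identifier
        'config': {'hidden_layer_sizes': (256, 256)},    # assumed hyperparameters
    },
    'algorithm_params': {
        'class_name': 'SAC',                             # assumed identifier
        'config': {
            'lr': 3e-4,          # assumed values, mirroring the older example above
            'discount': 0.99,
            'n_epochs': 1000,
            'epoch_length': 100,
        },
    },
}
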
def run_experiment(variant, reporter):
    # 2D MultiGoal point-mass environment; evaluation runs on a copy.
    training_environment = get_environment(
        'gym', 'MultiGoal', 'Default-v0', {
            'actuation_cost_coeff': 30,
            'distance_cost_coeff': 1,
            'goal_reward': 10,
            'init_sigma': 0.1,
        })
    evaluation_environment = training_environment.copy()

    pool = SimpleReplayPool(
        environment=training_environment, max_size=1e6)
    sampler = SimpleSampler(max_path_length=30)

    # Build the Q-functions and the policy from the variant spec.
    Qs = get_Q_function_from_variant(variant, training_environment)
    policy = get_policy_from_variant(variant, training_environment)

    # Plot Q-values and policy samples at four fixed observations.
    plotter = QFPolicyPlotter(
        Q=Qs[0],
        policy=policy,
        obs_lst=np.array(((-2.5, 0.0),
                          (0.0, 0.0),
                          (2.5, 2.5),
                          (-2.5, -2.5))),
        default_action=(np.nan, np.nan),
        n_samples=100)

    algorithm = get_algorithm_from_variant(
        variant=variant,
        training_environment=training_environment,
        evaluation_environment=evaluation_environment,
        policy=policy,
        Qs=Qs,
        pool=pool,
        sampler=sampler,
        min_pool_size=100,
        batch_size=64,
        plotter=plotter,
    )

    initialize_tf_variables(algorithm._session, only_uninitialized=True)

    for train_result in algorithm.train():
        reporter(**train_result)
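
# Minimal driver sketch (an assumption, not part of the original example): the
# training loop only forwards each train_result dict to the reporter, so any
# callable accepting keyword arguments works, e.g. a tune-style trial reporter
# or the simple printer below.
def print_reporter(**train_result):
    # Log which diagnostics the algorithm emitted for this epoch.
    print(sorted(train_result.keys()))

# Hypothetical usage: run_experiment(variant, print_reporter)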