Example #1
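This first example fine-tunes a scripted protocol: the agent learns residual corrections to the beta and phi actions (scaled by 3/8 and pi/8), collecting small training batches and evaluating on a separate, larger batch.
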
from math import pi

# Params for action wrapper
action_scale = {'beta': 3/8, 'phi': pi/8}
to_learn = {'beta': True, 'phi': True}

train_batch_size = 10
eval_batch_size = 1000

# Learn additive corrections to the scripted actions instead of replacing them
learn_residuals = True

# Constant episode length: always the full horizon T from env_kwargs
train_episode_length = lambda x: env_kwargs['T']
eval_episode_length = lambda x: env_kwargs['T']
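
Since the drivers accept a callable here, the episode length could also be scheduled over training. A minimal sketch, assuming the callable receives the current epoch index (the growth rate below is illustrative, not from the original script):

train_episode_length = lambda epoch: min(env_kwargs['T'], 1 + epoch // 50)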

# Create drivers for data collection
from rl_tools.agents import dynamic_episode_driver_sim_env

collect_driver = dynamic_episode_driver_sim_env.DynamicEpisodeDriverSimEnv(
    env_kwargs, reward_kwargs, train_batch_size, action_script, action_scale, 
    to_learn, train_episode_length, learn_residuals)

eval_driver = dynamic_episode_driver_sim_env.DynamicEpisodeDriverSimEnv(
    env_kwargs, reward_kwargs_eval, eval_batch_size, action_script, action_scale, 
    to_learn, eval_episode_length, learn_residuals)
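
The collection driver gathers small batches of 10 episodes with the training reward, while the evaluation driver runs 1000 episodes with reward_kwargs_eval, giving a lower-variance estimate of the protocol's performance.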


PPO.train_eval(
    root_dir=root_dir,
    random_seed=4,
    num_epochs=300,
    # Params for train
    normalize_observations=True,
    normalize_rewards=False,
    discount_factor=1.0,
)

Example #2

from math import sqrt, pi

reward_kwargs = {
    'stabilizer_translations': [sqrt(pi) + 0j, 2j * sqrt(pi)]
}
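
In this example the protocol is learned from scratch rather than as a residual correction: alpha and phi are trainable, beta follows the 'v1_phase_estimation_X_prep_4round' script, and the reward is defined through the stabilizer translations above.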

# Params for action wrapper
action_script = 'v1_phase_estimation_X_prep_4round'
action_scale = {'alpha': 1, 'beta': 1, 'phi': pi}
to_learn = {'alpha': True, 'beta': False, 'phi': True}

train_batch_size = 1000
eval_batch_size = 1000

# Create drivers for data collection
from rl_tools.agents import dynamic_episode_driver_sim_env

collect_driver = dynamic_episode_driver_sim_env.DynamicEpisodeDriverSimEnv(
    env_kwargs, reward_kwargs, train_batch_size, action_script, action_scale,
    to_learn)

eval_driver = dynamic_episode_driver_sim_env.DynamicEpisodeDriverSimEnv(
    env_kwargs, reward_kwargs, eval_batch_size, action_script, action_scale,
    to_learn)
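
Unlike Example #1, both drivers share the same reward_kwargs here, and no episode-length schedule or residual-learning flag is passed, so those presumably take the constructor's defaults.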

PPO.train_eval(
    root_dir=root_dir,
    random_seed=0,
    num_epochs=10000,
    # Params for train
    normalize_observations=True,
    normalize_rewards=False,
    discount_factor=1.0,
    lr=1e-4,