Example #1
 def __init__(self, env_kwargs_list, rew_kwargs_list, batch_size,
              action_script, action_scale, to_learn, episode_length_list,
              env_schedule=None):
     """
     Args:
          env_kwargs_list (list[dict]): list of parameters for the training
              environments.
          rew_kwargs_list (list[dict]): list of parameters for the reward
              functions. Should correspond to 'env_kwargs_list'.
         batch_size (int): number of episodes collected in parallel.
         action_script (str): name of action script. Action wrapper will 
             select actions from this script if they are not learned.
         action_scale (dict, str:float): dictionary mapping action dimensions
             to scaling factors. Action wrapper will rescale actions produced
             by the agent's neural net policy by these factors.
         to_learn (dict, str:bool): dictionary mapping action dimensions to 
             bool flags. Specifies if the action should be learned or scripted.
          episode_length_list (list[callable: int -> int]): list of schedule
              functions for episode durations. Each schedule function takes an
              int epoch number and returns the int episode duration for that
              epoch. The list should correspond to 'env_kwargs_list'.
          env_schedule (callable): function mapping an epoch number to the
              index of the environment in 'env_kwargs_list' to use during
              that epoch. If None, environments are used in round-robin order.
     """
     self.env_list, self.driver_list = [], []
     self.episode_length_list = episode_length_list
     for env_kwargs, rew_kwargs in zip(env_kwargs_list, rew_kwargs_list):
         # Create training env and wrap it
         env = env_init(batch_size=batch_size, reward_kwargs=rew_kwargs,
                        **env_kwargs)
         action_script_m = getattr(action_scripts, action_script)
         env = wrappers.ActionWrapper(env, action_script_m, action_scale, 
                                      to_learn)
 
         # create dummy placeholder policy to initialize driver
         dummy_policy = PolicyPlaceholder(
             env.time_step_spec(), env.action_spec())
         
         # create driver for this environment
         driver = dynamic_episode_driver.DynamicEpisodeDriver(
             env, dummy_policy, num_episodes=batch_size)
         
         self.env_list.append(env)
         self.driver_list.append(driver)
     
     if env_schedule is None:
         # regularly switch between environments
         self.env_schedule = lambda epoch: epoch % len(self.env_list)
     else:
         self.env_schedule = env_schedule
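
The 'env_schedule' argument and its round-robin default (the branch above) are easiest to see in isolation. A minimal, self-contained sketch; the two-epochs-per-environment variant is purely illustrative and does not appear in the snippet:

num_envs = 2

# Default behaviour when env_schedule is None: cycle through environments.
round_robin = lambda epoch: epoch % num_envs
# Hypothetical custom schedule: spend two consecutive epochs on each environment.
two_epochs_each = lambda epoch: (epoch // 2) % num_envs

print([round_robin(e) for e in range(6)])       # [0, 1, 0, 1, 0, 1]
print([two_epochs_each(e) for e in range(6)])   # [0, 0, 1, 1, 0, 0]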
Example #2
    def __init__(self,
                 env_kwargs,
                 reward_kwargs,
                 batch_size,
                 action_script,
                 action_scale,
                 to_learn,
                 episode_length,
                 learn_residuals=False,
                 remote=False):
        """
        Args:
            env_kwargs (dict): optional parameters for training environment.
            reward_kwargs (dict): optional parameters for reward function.
            batch_size (int): number of episodes collected in parallel.
            action_script (str): name of action script. Action wrapper will 
                select actions from this script if they are not learned.
            action_scale (dict, str:float): dictionary mapping action dimensions
                to scaling factors. Action wrapper will rescale actions produced
                by the agent's neural net policy by these factors.
            to_learn (dict, str:bool): dictionary mapping action dimensions to 
                bool flags. Specifies if the action should be learned or scripted.
            episode_length (callable: int -> int): function that defines the
                schedule for training episode durations. Takes an int epoch
                number and returns the int episode duration for that epoch.
            learn_residuals (bool): flag to learn residual over the scripted
                protocol. If False, will learn actions from scratch. If True,
                will learn a residual to be added to scripted protocol.
            remote (bool): flag for a remote environment; if True, the
                connection to the client is closed when training finishes.
        """
        self.episode_length = episode_length
        self.remote = remote
        # Create training env and wrap it
        env = env_init(batch_size=batch_size,
                       reward_kwargs=reward_kwargs,
                       **env_kwargs)
        module_name = 'rl_tools.action_script.' + action_script
        action_script = importlib.import_module(module_name)
        env = wrappers.ActionWrapper(env,
                                     action_script,
                                     action_scale,
                                     to_learn,
                                     learn_residuals=learn_residuals)

        # create dummy placeholder policy to initialize parent class
        dummy_policy = PolicyPlaceholder(env.time_step_spec(),
                                         env.action_spec())

        super().__init__(env, dummy_policy, num_episodes=batch_size)
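
A hypothetical construction sketch for this trainer; 'Trainer' is a placeholder for the enclosing class (its name is not shown here), and the keyword values are borrowed from Example #3 rather than taken from a real training run:

from numpy import sqrt, pi   # assumed source of sqrt and pi, as used in Example #3

trainer = Trainer(                       # placeholder class name
    env_kwargs={'control_circuit': 'snap_and_displacement', 'init': 'vac',
                'T': 9, 'N': 200},
    reward_kwargs={'reward_mode': 'stabilizers_v2', 'Delta': 0.0,
                   'beta': sqrt(pi), 'sample': False},
    batch_size=16,                       # illustrative batch size
    action_script='snap_and_displacements',
    action_scale={'alpha': 6, 'theta': pi},
    to_learn={'alpha': True, 'theta': True},
    episode_length=lambda epoch: 9,      # constant-length episodes
    learn_residuals=False)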
Example #3
    #                   'postselect_0' : False}
    
    reward_kwargs = {'reward_mode': 'stabilizers_v2',
                     'Delta': 0.0, 'beta': sqrt(pi),
                     'sample': False}
    
    env = env_init(control_circuit='snap_and_displacement', reward_kwargs=reward_kwargs,
                   init='vac', T=9, batch_size=1, N=200, episode_length=9)
    
    action_script = 'snap_and_displacements'
    action_scale = {'alpha':6, 'theta':pi}
    to_learn = {'alpha':True, 'theta':True}
    
    module_name = 'rl_tools.action_script.' + action_script
    action_script = importlib.import_module(module_name)
    env = wrappers.ActionWrapper(env, action_script, action_scale, to_learn)

    delta_dir = os.path.join(root_dir, 'delta' + str(Delta))
    seed_dir = os.path.join(delta_dir, best_seed[Delta])
    policy_dir = r'policy\010000'
    policy = tf.compat.v2.saved_model.load(os.path.join(seed_dir, policy_dir))
    

    time_step = env.reset()
    policy_state = policy.get_initial_state(env.batch_size)
    while not time_step.is_last():
        action_step = policy.action(time_step, policy_state)
        policy_state = action_step.state
        time_step = env.step(action_step.action)
    env.render()
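
Example #4 below stops right where it begins to build an actions dictionary. A plausible continuation of the rollout loop above that records the per-step actions could look like the sketch below; it assumes action_step.action is a dict of batched eager tensors (e.g. 'alpha' and 'theta'), as the dict-valued action_scale and to_learn suggest:

from collections import defaultdict

actions = defaultdict(list)   # action name -> list of per-step batched values
time_step = env.reset()
policy_state = policy.get_initial_state(env.batch_size)
while not time_step.is_last():
    action_step = policy.action(time_step, policy_state)
    policy_state = action_step.state
    for name, value in action_step.action.items():
        actions[name].append(value.numpy())   # record this step's action
    time_step = env.step(action_step.action)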
    
Example #4
    'phi': True,
    'flip': True,
    'cavity_phase': True,
    'Kerr_drive_amp': True,
    'alpha_correction': True,
    'qb_detune': True,
    'qb_drag': True
}

env = env_init(batch_size=1, **env_kwargs, episode_length=env_kwargs['T'])

action_script_obj = importlib.import_module('rl_tools.action_script.' +
                                            action_script)
env = wrappers.ActionWrapper(env,
                             action_script_obj,
                             action_scale,
                             to_learn,
                             learn_residuals=True)

policy_dir = 'policy\\' + policy_str
policy = tf.compat.v2.saved_model.load(
    os.path.join(root_dir, exp_name, policy_dir))

time_step = env.reset()
policy_state = policy.get_initial_state(env.batch_size)
while not time_step.is_last():
    action_step = policy.action(time_step, policy_state)
    policy_state = action_step.state
    time_step = env.step(action_step.action)

actions = {