def __init__(self, env_kwargs_list, rew_kwargs_list, batch_size,
             action_script, action_scale, to_learn, episode_length_list,
             env_schedule=None):
    """Build one wrapped training environment and episode driver per
    entry of 'env_kwargs_list', plus a schedule selecting which
    environment to use at each epoch.

    Args:
        env_kwargs_list (list[dict]): list of parameters for training
            environment.
        rew_kwargs_list (list[dict]): list of parameters for reward
            functions. Should correspond to 'env_kwargs_list'.
        batch_size (int): number of episodes collected in parallel.
        action_script (str): name of action script. Action wrapper will
            select actions from this script if they are not learned.
        action_scale (dict, str:float): dictionary mapping action
            dimensions to scaling factors. Action wrapper will rescale
            actions produced by the agent's neural net policy by these
            factors.
        to_learn (dict, str:bool): dictionary mapping action dimensions
            to bool flags. Specifies if the action should be learned or
            scripted.
        episode_length_list (list[callable: int -> int]): list of
            schedule functions for episode durations. Schedule functions
            take as argument int epoch number and return int episode
            duration for this epoch. The list should correspond to
            'env_kwargs_list'.
        env_schedule (callable): function mapping epoch number to index
            of the environment from the list to use during this epoch.
            If None, environments are cycled through round-robin.
    """
    self.env_list, self.driver_list = [], []
    self.episode_length_list = episode_length_list

    for env_kwargs, rew_kwargs in zip(env_kwargs_list, rew_kwargs_list):
        # Create training env and wrap it
        env = gkp_init(batch_size=batch_size, reward_kwargs=rew_kwargs,
                       **env_kwargs)
        # getattr() is the idiomatic way to resolve a module attribute
        # by name (direct __getattribute__ calls are discouraged).
        action_script_m = getattr(action_scripts, action_script)
        env = wrappers.ActionWrapper(env, action_script_m, action_scale,
                                     to_learn)

        # create dummy placeholder policy to initialize driver
        dummy_policy = PolicyPlaceholder(
            env.time_step_spec(), env.action_spec())

        # create driver for this environment
        driver = dynamic_episode_driver.DynamicEpisodeDriver(
            env, dummy_policy, num_episodes=batch_size)

        self.env_list.append(env)
        self.driver_list.append(driver)

    if env_schedule is None:
        # Default: regularly switch between environments, one per epoch.
        # Safe w.r.t. late binding: env_list is fully built by now and
        # only its length is read at call time.
        self.env_schedule = lambda epoch: epoch % len(self.env_list)
    else:
        self.env_schedule = env_schedule
def __init__(self, env_kwargs, reward_kwargs, batch_size, action_script,
             action_scale, to_learn, episode_length, learn_residuals=False):
    """Create a single wrapped training environment and initialize the
    parent episode driver with a placeholder policy.

    Args:
        env_kwargs (dict): optional parameters for training environment.
        reward_kwargs (dict): optional parameters for reward function.
        batch_size (int): number of episodes collected in parallel.
        action_script (str): name of action script. Action wrapper will
            select actions from this script if they are not learned.
        action_scale (dict, str:float): dictionary mapping action
            dimensions to scaling factors. Action wrapper will rescale
            actions produced by the agent's neural net policy by these
            factors.
        to_learn (dict, str:bool): dictionary mapping action dimensions
            to bool flags. Specifies if the action should be learned or
            scripted.
        episode_length (callable: int -> int): function that defines the
            schedule for training episode durations. Takes as argument
            int epoch number and returns int episode duration for this
            epoch.
        learn_residuals (bool): flag to learn residual over the scripted
            protocol. If False, will learn actions from scratch. If
            True, will learn a residual to be added to scripted
            protocol.
    """
    self.episode_length = episode_length

    # Create training env and wrap it
    env = gkp_init(batch_size=batch_size, reward_kwargs=reward_kwargs,
                   **env_kwargs)
    # getattr() is the idiomatic module-attribute lookup; a distinct
    # name also avoids shadowing the 'action_script' str parameter.
    action_script_m = getattr(action_scripts, action_script)
    env = wrappers.ActionWrapper(env, action_script_m, action_scale,
                                 to_learn, learn_residuals=learn_residuals)

    # create dummy placeholder policy to initialize parent class
    dummy_policy = PolicyPlaceholder(
        env.time_step_spec(), env.action_spec())

    super().__init__(env, dummy_policy, num_episodes=batch_size)
import gkp.action_script as action_scripts from gkp.gkp_tf_env import tf_env_wrappers as wrappers env_kwargs = dict(simulate='snap_and_displacement_miscalibrated', init='vac', H=1, T=5, attn_step=1, N=50, batch_size=1000, episode_length=5) target_state = qt.tensor(qt.basis(2,0), qt.basis(50,3)) reward_kwargs = {'reward_mode' : 'overlap', 'target_state' : target_state, 'postselect_0' : False} action_script = 'snap_and_displacements' action_scale = {'alpha':4, 'theta':pi} to_learn = {'alpha':True, 'theta':True} action_script = action_scripts.__getattribute__(action_script) protocol = 'ideal' max_epochs = 3000 gate_times = [0.4e-6, 3.4e-6] seeds = ['seed2'] rewards = {t:{} for t in gate_times} norms = {t:{} for t in gate_times} for t in gate_times: env = gkp_init(**env_kwargs, reward_kwargs=reward_kwargs) env = wrappers.ActionWrapper(env, action_script, action_scale, to_learn) env._env.SNAP_miscalibrated.T = t env._env.bit_string = None # '00000' # collect episodes with different policies for sim_name in seeds: #os.listdir(root_dir[protocol]):