def __init__(self, env_kwargs_list, rew_kwargs_list, batch_size, action_script,
             action_scale, to_learn, episode_length_list, env_schedule=None):
    """
    Args:
        env_kwargs_list (list[dict]): list of parameters for the training
            environments.
        rew_kwargs_list (list[dict]): list of parameters for the reward
            functions. Should correspond to 'env_kwargs_list'.
        batch_size (int): number of episodes collected in parallel.
        action_script (str): name of action script. Action wrapper will select
            actions from this script if they are not learned.
        action_scale (dict, str:float): dictionary mapping action dimensions
            to scaling factors. Action wrapper will rescale actions produced
            by the agent's neural net policy by these factors.
        to_learn (dict, str:bool): dictionary mapping action dimensions to
            bool flags. Specifies if the action should be learned or scripted.
        episode_length_list (list[callable: int -> int]): list of schedule
            functions for episode durations. Each schedule function takes an
            int epoch number and returns an int episode duration for that
            epoch. The list should correspond to 'env_kwargs_list'.
        env_schedule (callable): function mapping the epoch number to the
            index of the environment from the list to use during that epoch.
    """
    self.env_list, self.driver_list = [], []
    self.episode_length_list = episode_length_list

    for env_kwargs, rew_kwargs in zip(env_kwargs_list, rew_kwargs_list):
        # Create training env and wrap it
        env = env_init(batch_size=batch_size, reward_kwargs=rew_kwargs,
                       **env_kwargs)
        action_script_m = getattr(action_scripts, action_script)
        env = wrappers.ActionWrapper(env, action_script_m, action_scale,
                                     to_learn)

        # create dummy placeholder policy to initialize driver
        dummy_policy = PolicyPlaceholder(
            env.time_step_spec(), env.action_spec())

        # create driver for this environment
        driver = dynamic_episode_driver.DynamicEpisodeDriver(
            env, dummy_policy, num_episodes=batch_size)

        self.env_list.append(env)
        self.driver_list.append(driver)

    if env_schedule is None:
        # regularly switch between environments
        self.env_schedule = lambda epoch: epoch % len(self.env_list)
    else:
        self.env_schedule = env_schedule
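# A minimal sketch of how the scheduling arguments above could be built when
# instantiating this multi-environment trainer. All values are illustrative
# assumptions, not settings used in this repository.
episode_length_list = [lambda epoch: 5 if epoch < 1000 else 10,  # env 0: grow episodes mid-training
                       lambda epoch: 9]                          # env 1: fixed episode duration
# alternate between the two environments every 100 epochs instead of the
# default per-epoch round-robin
env_schedule = lambda epoch: (epoch // 100) % 2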
def __init__(self, env_kwargs, reward_kwargs, batch_size, action_script,
             action_scale, to_learn, episode_length, learn_residuals=False,
             remote=False):
    """
    Args:
        env_kwargs (dict): optional parameters for the training environment.
        reward_kwargs (dict): optional parameters for the reward function.
        batch_size (int): number of episodes collected in parallel.
        action_script (str): name of action script. Action wrapper will select
            actions from this script if they are not learned.
        action_scale (dict, str:float): dictionary mapping action dimensions
            to scaling factors. Action wrapper will rescale actions produced
            by the agent's neural net policy by these factors.
        to_learn (dict, str:bool): dictionary mapping action dimensions to
            bool flags. Specifies if the action should be learned or scripted.
        episode_length (callable: int -> int): function that defines the
            schedule of training episode durations. Takes an int epoch number
            and returns an int episode duration for that epoch.
        learn_residuals (bool): flag to learn residuals over the scripted
            protocol. If False, actions are learned from scratch. If True, the
            agent learns a residual that is added to the scripted protocol.
        remote (bool): flag for a remote environment to close the connection
            to the client upon finishing the training.
    """
    self.episode_length = episode_length
    self.remote = remote

    # Create training env and wrap it
    env = env_init(batch_size=batch_size, reward_kwargs=reward_kwargs,
                   **env_kwargs)
    module_name = 'rl_tools.action_script.' + action_script
    action_script = importlib.import_module(module_name)
    env = wrappers.ActionWrapper(env, action_script, action_scale, to_learn,
                                 learn_residuals=learn_residuals)

    # create dummy placeholder policy to initialize parent class
    dummy_policy = PolicyPlaceholder(env.time_step_spec(), env.action_spec())

    super().__init__(env, dummy_policy, num_episodes=batch_size)
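# A hedged sketch of an 'episode_length' schedule function as described in the
# docstring above: short episodes early in training, full-length episodes
# later. The breakpoint and durations are illustrative assumptions only.
def episode_length_schedule(epoch):
    # ramp up the episode duration once training has progressed
    if epoch < 500:
        return 5
    return 10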
# 'postselect_0' : False}

reward_kwargs = {'reward_mode' : 'stabilizers_v2',
                 'Delta' : 0.0,
                 'beta' : sqrt(pi),
                 'sample' : False}

env = env_init(control_circuit='snap_and_displacement',
               reward_kwargs=reward_kwargs,
               init='vac', T=9, batch_size=1, N=200, episode_length=9)

action_script = 'snap_and_displacements'
action_scale = {'alpha': 6, 'theta': pi}
to_learn = {'alpha': True, 'theta': True}

module_name = 'rl_tools.action_script.' + action_script
action_script = importlib.import_module(module_name)
env = wrappers.ActionWrapper(env, action_script, action_scale, to_learn)

delta_dir = os.path.join(root_dir, 'delta' + str(Delta))
seed_dir = os.path.join(delta_dir, best_seed[Delta])
policy_dir = r'policy\010000'
policy = tf.compat.v2.saved_model.load(os.path.join(seed_dir, policy_dir))

# roll out the saved policy for one episode and render the result
time_step = env.reset()
policy_state = policy.get_initial_state(env.batch_size)
while not time_step.is_last():
    action_step = policy.action(time_step, policy_state)
    policy_state = action_step.state
    time_step = env.step(action_step.action)

env.render()
            'phi': True,
            'flip': True,
            'cavity_phase': True,
            'Kerr_drive_amp': True,
            'alpha_correction': True,
            'qb_detune': True,
            'qb_drag': True
            }

env = env_init(batch_size=1, **env_kwargs, episode_length=env_kwargs['T'])
action_script_obj = importlib.import_module('rl_tools.action_script.' + action_script)
env = wrappers.ActionWrapper(env, action_script_obj, action_scale, to_learn,
                             learn_residuals=True)

policy_dir = 'policy\\' + policy_str
policy = tf.compat.v2.saved_model.load(
    os.path.join(root_dir, exp_name, policy_dir))

# roll out the saved policy for one episode
time_step = env.reset()
policy_state = policy.get_initial_state(env.batch_size)
while not time_step.is_last():
    action_step = policy.action(time_step, policy_state)
    policy_state = action_step.state
    time_step = env.step(action_step.action)

actions = {