def call(self, inputs, step_type=(), network_state=(), training=False):
  flat_inputs = tf.nest.flatten(inputs)
  del step_type  # unused.
  processed_inputs = []
  for single_input, input_spec in zip(flat_inputs, self._flat_specs):
    if common_lib.is_categorical_spec(input_spec):
      if input_spec.name == 'step_num':
        # Step numbers are either skipped or given a special encoding.
        if self._step_encoding is None:
          continue
        if self._max_trajectory_length_train is not None:
          max_step = self._max_trajectory_length_train
        else:
          max_step = input_spec.maximum
        processed_input = self._process_step_num(single_input, max_step)
      else:
        processed_input = tf.one_hot(single_input, input_spec.maximum + 1)
    else:
      if len(input_spec.shape) != 1:  # Only allow vector inputs.
        raise ValueError('Invalid input spec shape %s.' % input_spec.shape)
      processed_input = single_input
    processed_inputs.append(processed_input)

  joint = tf.concat(processed_inputs, -1)
  for layer in self._fc_layers:
    joint = layer(joint, training=training)

  if self._output_dim is None:
    joint = tf.reshape(joint, [-1])

  return joint, network_state
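# A minimal standalone sketch (not repo code) of the categorical branch
# above: bounded categorical inputs are one-hot encoded with depth
# input_spec.maximum + 1. The spec maximum of 3 here is a made-up example.
import tensorflow as tf

actions = tf.constant([0, 2, 3])      # batch of categorical inputs
depth = 3 + 1                         # input_spec.maximum + 1
one_hot = tf.one_hot(actions, depth)  # shape [3, 4], float32
# Continuous rank-1 inputs pass through unchanged; all processed inputs are
# then concatenated on the last axis before the fully-connected layers.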
def __init__(self,
             dataset_spec,
             policy_optimizer,
             gamma: Union[float, tf.Tensor],
             z_learning_rate=0.5,
             v_learning_rate=0.5,
             entropy_reg=0.1,
             reward_fn: Optional[Callable] = None):
  """Initializes the solver.

  Args:
    dataset_spec: The spec of the dataset that will be given.
    policy_optimizer: TF optimizer for distilling a policy from z.
    gamma: The discount factor to use.
    z_learning_rate: Learning rate for z.
    v_learning_rate: Learning rate for v.
    entropy_reg: Coefficient on entropy regularization.
    reward_fn: A function that takes in an EnvStep and returns the reward
      for that step. If not specified, defaults to just EnvStep.reward.
  """
  self._dataset_spec = dataset_spec
  self._policy_optimizer = policy_optimizer
  self._z_learning_rate = z_learning_rate
  self._v_learning_rate = v_learning_rate
  self._entropy_reg = entropy_reg
  self._gamma = gamma
  if reward_fn is None:
    reward_fn = lambda env_step: env_step.reward
  self._reward_fn = reward_fn

  # Get number of states/actions.
  observation_spec = self._dataset_spec.observation
  action_spec = self._dataset_spec.action
  if not common_lib.is_categorical_spec(observation_spec):
    raise ValueError('Observation spec must be discrete and bounded.')
  self._num_states = observation_spec.maximum + 1

  if not common_lib.is_categorical_spec(action_spec):
    raise ValueError('Action spec must be discrete and bounded.')
  self._num_actions = action_spec.maximum + 1

  self._zetas = np.zeros([self._num_states * self._num_actions])
  self._values = np.zeros([self._num_states])
  self._policy = tf.Variable(
      np.zeros([self._num_states, self._num_actions]))
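# A sketch of the flat (state, action) indexing implied by the
# [num_states * num_actions] shape of zetas above; sa_index is a
# hypothetical helper, not part of the solver.
import numpy as np

num_states, num_actions = 5, 3
zetas = np.zeros([num_states * num_actions])

def sa_index(state, action):
  # Row-major index of a (state, action) pair.
  return state * num_actions + action

zetas[sa_index(2, 1)] = 1.0  # updates exactly one (state, action) entry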
def __init__(self, policy, epsilon, emit_log_probability=True):
  self._wrapped_policy = policy
  self._epsilon = epsilon
  if not common_lib.is_categorical_spec(policy.action_spec):
    raise ValueError('Action spec must be categorical to define '
                     'epsilon-greedy policy.')

  super(EpsilonGreedyPolicy, self).__init__(
      policy.time_step_spec,
      policy.action_spec,
      policy.policy_state_spec,
      policy.info_spec,
      emit_log_probability=emit_log_probability)
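# A sketch of the standard epsilon-greedy mixture such a wrapper typically
# induces (illustrative numbers, not taken from this class): with
# probability epsilon act uniformly at random, otherwise follow the
# wrapped policy.
import numpy as np

epsilon, num_actions = 0.1, 4
wrapped_probs = np.array([0.7, 0.1, 0.1, 0.1])  # wrapped policy distribution
mixed_probs = epsilon / num_actions + (1 - epsilon) * wrapped_probs
assert np.isclose(mixed_probs.sum(), 1.0)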
def call(self, inputs, step_type=(), network_state=(), training=False,
         mask=None):
  flat_inputs = tf.nest.flatten(inputs)
  del step_type  # unused.
  processed_inputs = []
  for single_input, input_spec in zip(flat_inputs, self._flat_specs):
    if common_lib.is_categorical_spec(input_spec):
      processed_input = tf.one_hot(single_input, input_spec.maximum + 1)
    else:
      if len(input_spec.shape) != 1:  # Only allow vector inputs.
        raise ValueError('Invalid input spec shape %s.' % input_spec.shape)
      processed_input = single_input
    processed_inputs.append(processed_input)

  joint = tf.concat(processed_inputs, -1)
  for layer in self._fc_layers:
    joint = layer(joint, training=training)

  outer_rank = nest_utils.get_outer_rank(inputs, self.input_tensor_spec)

  def call_projection_net(proj_net):
    distribution, _ = proj_net(joint, outer_rank, training=training,
                               mask=mask)
    return distribution

  output_actions = tf.nest.map_structure(call_projection_net,
                                         self._projection_networks)
  return output_actions, network_state
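# A small standalone illustration of the outer-rank inference used above,
# assuming tf_agents is available: get_outer_rank counts the batch
# dimensions preceding the spec's shape.
import tensorflow as tf
from tf_agents.utils import nest_utils

spec = tf.TensorSpec([3], tf.float32)
batched = tf.zeros([32, 3])  # one outer (batch) dimension
assert nest_utils.get_outer_rank(batched, spec) == 1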
def __init__(self,
             dataset_spec,
             nu_network,
             zeta_network,
             nu_optimizer,
             zeta_optimizer,
             gamma: Union[float, tf.Tensor],
             reward_fn: Optional[Callable] = None,
             solve_for_state_action_ratio: bool = True,
             f_exponent: float = 1.5,
             primal_form: bool = False,
             num_samples: Optional[int] = None,
             nu_regularizer: float = 0.,
             zeta_regularizer: float = 0.):
  """Initializes the solver.

  Args:
    dataset_spec: The spec of the dataset that will be given.
    nu_network: The nu-value network.
    zeta_network: The zeta-value network.
    nu_optimizer: The optimizer to use for nu.
    zeta_optimizer: The optimizer to use for zeta.
    gamma: The discount factor to use.
    reward_fn: A function that takes in an EnvStep and returns the reward
      for that step. If not specified, defaults to just EnvStep.reward.
    solve_for_state_action_ratio: Whether to solve for state-action density
      ratio. Defaults to True.
    f_exponent: Exponent p to use for f(x) = |x|^p / p.
    primal_form: Whether to use the primal form of DualDICE, which optimizes
      for nu independent of zeta. This form is biased in stochastic
      environments. Defaults to False, which uses the saddle-point
      formulation of DualDICE.
    num_samples: Number of samples to take from the policy to estimate the
      average next nu value. If actions are discrete, this defaults to
      computing the average explicitly. If actions are not discrete, this
      defaults to using a single sample.
    nu_regularizer: Regularization coefficient on the nu network.
    zeta_regularizer: Regularization coefficient on the zeta network.
  """
  self._dataset_spec = dataset_spec
  self._nu_network = nu_network
  self._nu_network.create_variables()
  self._zeta_network = zeta_network
  self._zeta_network.create_variables()
  self._nu_optimizer = nu_optimizer
  self._zeta_optimizer = zeta_optimizer
  self._nu_regularizer = nu_regularizer
  self._zeta_regularizer = zeta_regularizer
  self._gamma = gamma
  if reward_fn is None:
    reward_fn = lambda env_step: env_step.reward
  self._reward_fn = reward_fn
  self._num_samples = num_samples

  self._solve_for_state_action_ratio = solve_for_state_action_ratio
  if (not self._solve_for_state_action_ratio and
      not self._dataset_spec.has_log_probability()):
    raise ValueError('Dataset must contain log-probability when '
                     'solve_for_state_action_ratio is False.')

  if f_exponent <= 1:
    raise ValueError('Exponent for f must be greater than 1.')
  fstar_exponent = f_exponent / (f_exponent - 1)
  self._f_fn = lambda x: tf.abs(x)**f_exponent / f_exponent
  self._fstar_fn = lambda x: tf.abs(x)**fstar_exponent / fstar_exponent

  self._categorical_action = common_lib.is_categorical_spec(
      self._dataset_spec.action)
  if not self._categorical_action and self._num_samples is None:
    self._num_samples = 1

  self._primal_form = primal_form
  self._initialize()
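# A quick numeric check of the conjugate-exponent relation used above: for
# f(x) = |x|^p / p, the convex (Fenchel) conjugate is f*(x) = |x|^q / q
# with 1/p + 1/q = 1. Standalone illustration, not part of the solver.
p = 1.5
q = p / (p - 1)  # -> 3.0, matching fstar_exponent above
assert abs(1 / p + 1 / q - 1.0) < 1e-12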
def __init__(self,
             dataset_spec,
             alpha_optimizer,
             gamma: Union[float, tf.Tensor],
             divergence_limit: Union[float, np.ndarray, tf.Tensor],
             reward_fn: Optional[Callable] = None,
             solve_for_state_action_ratio: bool = True,
             divergence_type: Text = 'rkl',
             algae_alpha: Union[float, tf.Tensor] = 1.0,
             weight_by_gamma: bool = True,
             limit_episodes: Optional[int] = None,
             num_samples: Optional[int] = None):
  """Initializes the solver.

  Args:
    dataset_spec: The spec of the dataset that will be given.
    alpha_optimizer: The optimizer to use for Lagrange multipliers on
      weights.
    gamma: The discount factor to use.
    divergence_limit: The limit on the f-divergence between the weights and
      the empirical distribution. This should contain half as many elements
      as the final dimension of the nu and zeta variables.
    reward_fn: A function that takes in an EnvStep and returns the reward
      for that step. If not specified, defaults to just EnvStep.reward.
    solve_for_state_action_ratio: Whether to solve for state-action density
      ratio. Defaults to True. Setting this to False solves for the state
      density ratio instead; although the estimated policy value should be
      the same, approximating using the state density ratio is much faster
      (especially in large environments) and more accurate (especially in
      low-data regimes).
    divergence_type: The type of f-divergence to use, e.g., 'kl'.
    algae_alpha: Regularizer coefficient on Df(dpi || dD).
    weight_by_gamma: Whether to weight the nu and zeta losses by
      gamma ** step_num.
    limit_episodes: How many episodes to take from the dataset. Defaults to
      None (take whole dataset).
    num_samples: Number of samples to take from the policy to estimate the
      average next nu value. If actions are discrete, this defaults to
      computing the average explicitly. If actions are not discrete, this
      defaults to using a single sample.
  """
  self._dataset_spec = dataset_spec
  self._gamma = gamma
  if reward_fn is None:
    reward_fn = lambda env_step: env_step.reward
  self._reward_fn = reward_fn

  self._solve_for_state_action_ratio = solve_for_state_action_ratio
  if (not self._solve_for_state_action_ratio and
      not self._dataset_spec.has_log_probability()):
    raise ValueError('Dataset must contain log-probability when '
                     'solve_for_state_action_ratio is False.')

  # Get number of states/actions.
  observation_spec = self._dataset_spec.observation
  action_spec = self._dataset_spec.action
  if not common_lib.is_categorical_spec(observation_spec):
    raise ValueError('Observation spec must be discrete and bounded.')
  self._num_states = observation_spec.maximum + 1

  if not common_lib.is_categorical_spec(action_spec):
    raise ValueError('Action spec must be discrete and bounded.')
  self._num_actions = action_spec.maximum + 1
  self._dimension = 1 + (self._num_states * self._num_actions
                         if self._solve_for_state_action_ratio
                         else self._num_states)

  # For learning data weight.
  self._divergence_limit = tf.convert_to_tensor(
      divergence_limit, dtype=tf.float32)
  if tf.rank(self._divergence_limit) < 1:
    self._divergence_limit = tf.expand_dims(self._divergence_limit, -1)
  self._two_sided_limit = tf.concat(
      [self._divergence_limit, self._divergence_limit], -1)
  self._num_limits = int(self._two_sided_limit.shape[0])
  # The Lagrange multiplier w.r.t. the data weight constraint.
  self._alpha = tf.Variable(
      np.zeros(self._two_sided_limit.shape), dtype=tf.float32)
  self._alpha_optimizer = alpha_optimizer

  self._algae_alpha = tf.convert_to_tensor(algae_alpha, dtype=tf.float32)
  if tf.rank(self._algae_alpha) < 1:
    self._algae_alpha = tf.expand_dims(self._algae_alpha, -1)
  if self._algae_alpha.shape[-1] != self._two_sided_limit.shape[-1]:
    self._algae_alpha *= tf.ones_like(self._two_sided_limit)
  self._algae_alpha_sign = 2 * (
      tf.cast(self._algae_alpha >= 0, tf.float32) - 0.5)

  self._num_samples = num_samples
  self._categorical_action = common_lib.is_categorical_spec(
      self._dataset_spec.action)
  if not self._categorical_action and self._num_samples is None:
    self._num_samples = 1

  self._divergence_type = divergence_type
  if self._divergence_type not in ['kl', 'rkl', 'chi2']:
    raise ValueError('Unsupported divergence type %s.' %
                     self._divergence_type)

  self._nu = tf.zeros([self._dimension, self._num_limits])
  self._nu2 = tf.zeros([self._dimension, self._num_limits])
  self._zeta = tf.zeros([self._dimension, self._num_limits])
  self._zeta2 = tf.zeros([self._dimension, self._num_limits])
  self._weight_by_gamma = weight_by_gamma
  self._limit_episodes = limit_episodes
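# A shape walk-through of the two-sided limit construction above, assuming
# a scalar divergence_limit (standalone sketch, not repo code).
import tensorflow as tf

divergence_limit = tf.convert_to_tensor(0.1, dtype=tf.float32)  # rank 0
divergence_limit = tf.expand_dims(divergence_limit, -1)         # shape [1]
two_sided_limit = tf.concat([divergence_limit, divergence_limit], -1)  # [2]
num_limits = int(two_sided_limit.shape[0])  # -> 2, one per side of the limit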
def __init__(self,
             dataset_spec,
             gamma: Union[float, tf.Tensor],
             reward_fn: Optional[Callable] = None,
             solve_for_state_action_ratio: bool = True,
             divergence_limit: Union[float, np.ndarray, tf.Tensor] = 0.0,
             divergence_type: Text = 'rkl',
             nu_learning_rate: Union[float, tf.Tensor] = 0.1,
             zeta_learning_rate: Union[float, tf.Tensor] = 0.1,
             algae_alpha: Union[float, tf.Tensor] = 1.0,
             limit_episodes: Optional[int] = None):
  """Initializes the solver.

  Args:
    dataset_spec: The spec of the dataset that will be given.
    gamma: The discount factor to use.
    reward_fn: A function that takes in an EnvStep and returns the reward
      for that step. If not specified, defaults to just EnvStep.reward.
    solve_for_state_action_ratio: Whether to solve for the state-action
      density ratio. Defaults to True. When solving an environment with a
      large state/action space (e.g., taxi), it is better to set this to
      False to avoid OOM issues.
    divergence_limit: The limit on the f-divergence between the weights and
      the empirical distribution.
    divergence_type: The type of f-divergence to use, e.g., 'kl'. Defaults
      to 'rkl', reverse KL.
    nu_learning_rate: Learning rate for nu.
    zeta_learning_rate: Learning rate for zeta.
    algae_alpha: Regularizer coefficient on Df(dpi || dD).
    limit_episodes: How many episodes to take from the dataset. Defaults to
      None (take whole dataset).
  """
  self._dataset_spec = dataset_spec
  self._gamma = gamma
  if reward_fn is None:
    reward_fn = lambda env_step: env_step.reward
  self._reward_fn = reward_fn

  self._solve_for_state_action_ratio = solve_for_state_action_ratio
  if (not self._solve_for_state_action_ratio and
      not self._dataset_spec.has_log_probability()):
    raise ValueError('Dataset must contain log-probability when '
                     'solve_for_state_action_ratio is False.')

  # Get number of states/actions.
  observation_spec = self._dataset_spec.observation
  action_spec = self._dataset_spec.action
  if not common_lib.is_categorical_spec(observation_spec):
    raise ValueError('Observation spec must be discrete and bounded.')
  self._num_states = observation_spec.maximum + 1

  if not common_lib.is_categorical_spec(action_spec):
    raise ValueError('Action spec must be discrete and bounded.')
  self._num_actions = action_spec.maximum + 1
  self._dimension = 1 + (self._num_states * self._num_actions
                         if self._solve_for_state_action_ratio
                         else self._num_states)

  # For learning data weight.
  self._divergence_limit = tf.convert_to_tensor(
      divergence_limit, dtype=tf.float32)
  if tf.rank(self._divergence_limit) < 1:
    self._divergence_limit = tf.expand_dims(self._divergence_limit, -1)
  self._two_sided_limit = tf.concat(
      [self._divergence_limit, self._divergence_limit], -1)
  self._num_limits = int(self._two_sided_limit.shape[0])
  # The Lagrange multiplier w.r.t. the data weight constraint.
  self._alpha = tf.Variable(
      np.zeros(self._two_sided_limit.shape), dtype=tf.float32)

  self._algae_alpha = tf.convert_to_tensor(algae_alpha, dtype=tf.float32)
  if tf.rank(self._algae_alpha) < 1:
    self._algae_alpha = tf.expand_dims(self._algae_alpha, -1)
  if self._algae_alpha.shape[-1] != self._two_sided_limit.shape[-1]:
    self._algae_alpha *= tf.ones_like(self._two_sided_limit)
  self._algae_alpha_sign = 2 * (
      tf.cast(self._algae_alpha >= 0, tf.float32) - 0.5)

  self._divergence_type = divergence_type
  if self._divergence_type not in ['kl', 'rkl', 'chi2']:
    raise ValueError('Unsupported divergence type %s.' %
                     self._divergence_type)

  self._nu_learning_rate = nu_learning_rate
  self._zeta_learning_rate = zeta_learning_rate

  # We have two variables to counteract the bias introduced by algae_alpha.
  self._nu = tf.zeros([self._dimension, self._num_limits])
  self._nu2 = tf.zeros([self._dimension, self._num_limits])
  self._zeta = tf.zeros([self._dimension, self._num_limits])
  self._zeta2 = tf.zeros([self._dimension, self._num_limits])
  self._limit_episodes = limit_episodes
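# For reference, the conventional generator functions behind the supported
# divergence types (standard definitions up to scaling conventions; not
# copied from this solver):
#   'kl':   f(x) = x * log(x)
#   'rkl':  f(x) = -log(x)       (reverse KL)
#   'chi2': f(x) = (x - 1)^2 / 2
import tensorflow as tf

def f_divergence_generator(x, divergence_type='rkl'):
  if divergence_type == 'kl':
    return x * tf.math.log(x)
  elif divergence_type == 'rkl':
    return -tf.math.log(x)
  elif divergence_type == 'chi2':
    return 0.5 * (x - 1.)**2
  raise ValueError('Unsupported divergence type %s.' % divergence_type)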
def __init__(self,
             dataset_spec,
             gamma: Union[float, tf.Tensor],
             reward_fn: Optional[Callable] = None,
             solve_for_state_action_ratio: bool = True,
             nu_learning_rate: Union[float, tf.Tensor] = 0.1,
             zeta_learning_rate: Union[float, tf.Tensor] = 0.1,
             kl_regularizer: Union[float, tf.Tensor] = 1.,
             eps_std: Union[float, tf.Tensor] = 1.):
  """Initializes the solver.

  Args:
    dataset_spec: The spec of the dataset that will be given.
    gamma: The discount factor to use.
    reward_fn: A function that takes in an EnvStep and returns the reward
      for that step. If not specified, defaults to just EnvStep.reward.
    solve_for_state_action_ratio: Whether to solve for the state-action
      density ratio. Defaults to True. When solving an environment with a
      large state/action space (e.g., taxi), it is better to set this to
      False to avoid OOM issues.
    nu_learning_rate: Learning rate for nu.
    zeta_learning_rate: Learning rate for zeta.
    kl_regularizer: Regularization constant for D_kl(q || p).
    eps_std: Standard deviation of the epsilon noise used when sampling
      from the posterior.
  """
  self._dataset_spec = dataset_spec
  self._gamma = gamma
  if reward_fn is None:
    reward_fn = lambda env_step: env_step.reward
  self._reward_fn = reward_fn
  self._kl_regularizer = kl_regularizer
  self._eps_std = eps_std

  self._solve_for_state_action_ratio = solve_for_state_action_ratio
  if (not self._solve_for_state_action_ratio and
      not self._dataset_spec.has_log_probability()):
    raise ValueError('Dataset must contain log-probability when '
                     'solve_for_state_action_ratio is False.')

  # Get number of states/actions.
  observation_spec = self._dataset_spec.observation
  action_spec = self._dataset_spec.action
  if not common_lib.is_categorical_spec(observation_spec):
    raise ValueError('Observation spec must be discrete and bounded.')
  self._num_states = observation_spec.maximum + 1

  if not common_lib.is_categorical_spec(action_spec):
    raise ValueError('Action spec must be discrete and bounded.')
  self._num_actions = action_spec.maximum + 1

  self._dimension = (self._num_states * self._num_actions
                     if self._solve_for_state_action_ratio
                     else self._num_states)

  self._td_residuals = np.zeros([self._dimension, self._dimension])
  self._total_weights = np.zeros([self._dimension])
  self._initial_weights = np.zeros([self._dimension])

  self._nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)
  self._zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate)

  # Initialize variational Bayes parameters.
  self._nu_mu = tf.Variable(tf.zeros([self._dimension]))
  self._nu_log_sigma = tf.Variable(tf.zeros([self._dimension]))
  self._prior_mu = tf.Variable(tf.zeros([self._dimension]), trainable=True)
  self._prior_log_sigma = tf.Variable(
      tf.zeros([self._dimension]), trainable=False)
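# A sketch of reparameterized sampling from the Gaussian variational
# posterior defined by (_nu_mu, _nu_log_sigma), showing how eps_std would
# typically enter; an assumption for illustration, not this solver's exact
# code.
import tensorflow as tf

dimension, eps_std = 8, 1.0
nu_mu = tf.Variable(tf.zeros([dimension]))
nu_log_sigma = tf.Variable(tf.zeros([dimension]))

eps = tf.random.normal([dimension], stddev=eps_std)
nu_sample = nu_mu + tf.exp(nu_log_sigma) * eps  # differentiable in mu, sigma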
def __init__(self,
             dataset_spec,
             policy_optimizer,
             policy_network,
             mode,
             ci_method,
             delta_tail,
             gamma: Union[float, tf.Tensor],
             reward_fn: Optional[Callable] = None,
             clipping: Optional[float] = 2000.,
             policy_regularizer: float = 0.,
             q_network=None,
             q_optimizer=None,
             target_update_tau: Union[float, tf.Tensor] = 0.01,
             target_update_period: int = 1,
             num_samples: Optional[int] = None):
  """Initializes the importance sampling estimator.

  Args:
    dataset_spec: The spec of the dataset that will be given.
    policy_optimizer: The optimizer to use for learning the policy.
    policy_network: The policy neural network.
    mode: The importance sampling estimator to use (e.g.,
      "weighted-step-wise").
    ci_method: Method for constructing confidence intervals (e.g., "CH" for
      Chernoff-Hoeffding).
    delta_tail: Total tail probability for the confidence interval (halved
      internally to form a two-sided interval).
    gamma: The discount factor to use.
    reward_fn: A function that takes in an EnvStep and returns the reward
      for that step. If not specified, defaults to just EnvStep.reward.
    clipping: Threshold for clipping the importance sampling factor.
    policy_regularizer: Coefficient on the policy regularizer.
    q_network: A function that returns the values for each observation and
      action. If specified, the Q-values are learned and used for
      doubly-robust estimation.
    q_optimizer: TF optimizer for q_network.
    target_update_tau: Coefficient for soft updates of the target network
      parameters.
    target_update_period: Period, in train steps, between target network
      updates.
    num_samples: Number of samples to take from the policy to estimate the
      average next state value. If actions are discrete, this defaults to
      computing the average explicitly. If actions are not discrete, this
      defaults to using a single sample.
  """
  self._dataset_spec = dataset_spec
  self._policy_optimizer = policy_optimizer
  self._policy_network = policy_network
  if self._policy_network is not None:
    self._policy_network.create_variables()
  self._mode = mode
  self._ci_method = ci_method
  self._delta_tail = delta_tail
  self._gamma = gamma
  if reward_fn is None:
    reward_fn = lambda env_step: env_step.reward
  self._reward_fn = reward_fn
  self._clipping = clipping
  self._policy_regularizer = policy_regularizer

  self._q_network = q_network
  if self._q_network is not None:
    self._q_network.create_variables()
    self._target_network = self._q_network.copy(name='TargetQNetwork')
    self._target_network.create_variables()
    self._target_update_tau = target_update_tau
    self._target_update_period = target_update_period
    self._update_targets = self._get_target_updater(
        tau=self._target_update_tau, period=self._target_update_period)
    self._q_optimizer = q_optimizer
    self._initialize()

  self._num_samples = num_samples
  self._categorical_action = common_lib.is_categorical_spec(
      self._dataset_spec.action)
  if not self._categorical_action and self._num_samples is None:
    self._num_samples = 1
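# A sketch of a two-sided Chernoff-Hoeffding interval for the "CH"
# ci_method, assuming i.i.d. per-episode returns bounded in [0, b]; the
# total tail probability delta_tail is split across the two tails, as the
# docstring notes. Illustrative only, not the estimator's exact code.
import numpy as np

def hoeffding_interval(samples, delta_tail, b):
  n = len(samples)
  mean = np.mean(samples)
  # Half-width b * sqrt(log(2 / delta) / (2n)) covers both tails with
  # total probability delta.
  half_width = b * np.sqrt(np.log(2. / delta_tail) / (2. * n))
  return mean - half_width, mean + half_width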