Example #1
    def call(self, inputs, step_type=(), network_state=(), training=False):
        flat_inputs = tf.nest.flatten(inputs)
        del step_type  # unused.

        processed_inputs = []
        for single_input, input_spec in zip(flat_inputs, self._flat_specs):
            if common_lib.is_categorical_spec(input_spec):
                if input_spec.name == 'step_num':
                    if self._step_encoding is None:
                        continue

                    if self._max_trajectory_length_train is not None:
                        max_step = self._max_trajectory_length_train
                    else:
                        max_step = input_spec.maximum
                    processed_input = self._process_step_num(
                        single_input, max_step)
                else:
                    processed_input = tf.one_hot(single_input,
                                                 input_spec.maximum + 1)
            else:
                if len(input_spec.shape) != 1:  # Only allow vector inputs.
                    raise ValueError('Invalid input spec shape %s.' %
                                     input_spec.shape)
                processed_input = single_input
            processed_inputs.append(processed_input)

        joint = tf.concat(processed_inputs, -1)
        for layer in self._fc_layers:
            joint = layer(joint, training=training)

        if self._output_dim is None:
            joint = tf.reshape(joint, [-1])

        return joint, network_state
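
Every snippet in this listing gates its preprocessing on `common_lib.is_categorical_spec`, whose definition is not included here. As a rough sketch of what such a check might look like, assuming TF-Agents' `tensor_spec` utilities (the real helper may use different conditions):

# Hypothetical sketch only; the real common_lib.is_categorical_spec may differ.
from tf_agents.specs import tensor_spec


def is_categorical_spec(spec):
    """True if `spec` is a scalar, bounded, integer-valued (categorical) spec."""
    return (tensor_spec.is_discrete(spec) and tensor_spec.is_bounded(spec)
            and spec.shape.rank == 0 and int(spec.minimum) == 0)
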
Example #2
    def __init__(self,
                 dataset_spec,
                 policy_optimizer,
                 gamma: Union[float, tf.Tensor],
                 z_learning_rate=0.5,
                 v_learning_rate=0.5,
                 entropy_reg=0.1,
                 reward_fn: Optional[Callable] = None):
        """Initializes the solver.

    Args:
      dataset_spec: The spec of the dataset that will be given.
      policy_optimizer: TF optimizer for distilling policy from z.
      gamma: The discount factor to use.
      z_learning_rate: Learning rate for z.
      v_learning_rate: Learning rate for v.
      entropy_reg: Coefficient on entropy regularization.
      reward_fn: A function that takes in an EnvStep and returns the reward for
        that step. If not specified, defaults to just EnvStep.reward.
    """
        self._dataset_spec = dataset_spec
        self._policy_optimizer = policy_optimizer
        self._z_learning_rate = z_learning_rate
        self._v_learning_rate = v_learning_rate
        self._entropy_reg = entropy_reg

        self._gamma = gamma
        if reward_fn is None:
            reward_fn = lambda env_step: env_step.reward
        self._reward_fn = reward_fn

        # Get number of states/actions.
        observation_spec = self._dataset_spec.observation
        action_spec = self._dataset_spec.action
        if not common_lib.is_categorical_spec(observation_spec):
            raise ValueError('Observation spec must be discrete and bounded.')
        self._num_states = observation_spec.maximum + 1

        if not common_lib.is_categorical_spec(action_spec):
            raise ValueError('Action spec must be discrete and bounded.')
        self._num_actions = action_spec.maximum + 1

        self._zetas = np.zeros([self._num_states * self._num_actions])
        self._values = np.zeros([self._num_states])
        self._policy = tf.Variable(
            np.zeros([self._num_states, self._num_actions]))
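
The constructor only requires that the observation and action specs be discrete and bounded. As a hedged usage sketch, specs for a small hypothetical tabular environment (names and sizes here are illustrative, not from the library) could be built with TF-Agents as follows:

# Illustrative only: bounded integer specs like these would pass the
# is_categorical_spec checks above (the real dataset_spec object is not shown).
import tensorflow as tf
from tf_agents.specs import tensor_spec

# A hypothetical 100-state, 4-action tabular environment.
observation_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int64, minimum=0, maximum=99, name='observation')
action_spec = tensor_spec.BoundedTensorSpec(
    shape=(), dtype=tf.int64, minimum=0, maximum=3, name='action')

num_states = observation_spec.maximum + 1   # 100
num_actions = action_spec.maximum + 1       # 4
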
Example #3
    def __init__(self, policy, epsilon, emit_log_probability=True):
        self._wrapped_policy = policy
        self._epsilon = epsilon
        if not common_lib.is_categorical_spec(policy.action_spec):
            raise ValueError('Action spec must be categorical to define '
                             'epsilon-greedy policy.')

        super(EpsilonGreedyPolicy,
              self).__init__(policy.time_step_spec,
                             policy.action_spec,
                             policy.policy_state_spec,
                             policy.info_spec,
                             emit_log_probability=emit_log_probability)
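
Only the constructor of the wrapper is shown. One plausible sketch of the epsilon-greedy mixing it implies (an assumption, not the class's actual `_distribution` method) blends the wrapped policy's action probabilities with a uniform distribution over actions:

# Hypothetical sketch of epsilon-greedy mixing; the class body is not shown above.
import tensorflow as tf
import tensorflow_probability as tfp


def epsilon_greedy_probs(greedy_probs: tf.Tensor, epsilon: float) -> tf.Tensor:
    """Mixes action probabilities with a uniform distribution over actions."""
    num_actions = tf.cast(tf.shape(greedy_probs)[-1], tf.float32)
    uniform = tf.ones_like(greedy_probs) / num_actions
    return (1.0 - epsilon) * greedy_probs + epsilon * uniform


# Example: with epsilon=0.1 and greedy probs [1, 0, 0, 0], the mixed
# distribution is [0.925, 0.025, 0.025, 0.025].
mixed = epsilon_greedy_probs(tf.constant([1.0, 0.0, 0.0, 0.0]), epsilon=0.1)
dist = tfp.distributions.Categorical(probs=mixed)
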
Example #4
    def call(self,
             inputs,
             step_type=(),
             network_state=(),
             training=False,
             mask=None):

        flat_inputs = tf.nest.flatten(inputs)
        del step_type  # unused.

        processed_inputs = []
        for single_input, input_spec in zip(flat_inputs, self._flat_specs):
            if common_lib.is_categorical_spec(input_spec):
                processed_input = tf.one_hot(single_input,
                                             input_spec.maximum + 1)
            else:
                if len(input_spec.shape) != 1:  # Only allow vector inputs.
                    raise ValueError('Invalid input spec shape %s.' %
                                     input_spec.shape)
                processed_input = single_input
            processed_inputs.append(processed_input)

        joint = tf.concat(processed_inputs, -1)
        for layer in self._fc_layers:
            joint = layer(joint, training=training)

        outer_rank = nest_utils.get_outer_rank(inputs, self.input_tensor_spec)

        def call_projection_net(proj_net):
            distribution, _ = proj_net(joint,
                                       outer_rank,
                                       training=training,
                                       mask=mask)
            return distribution

        output_actions = tf.nest.map_structure(call_projection_net,
                                               self._projection_networks)
        return output_actions, network_state
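
For a categorical input spec with, say, maximum = 3, the loop above one-hot encodes integer observations before concatenation. A standalone illustration of that preprocessing step:

# Standalone illustration of the one-hot preprocessing used in call() above.
import tensorflow as tf

single_input = tf.constant([2, 0, 3])   # batch of categorical observations
depth = 3 + 1                           # input_spec.maximum + 1
processed_input = tf.one_hot(single_input, depth)
# processed_input:
# [[0., 0., 1., 0.],
#  [1., 0., 0., 0.],
#  [0., 0., 0., 1.]]
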
Example #5
    def __init__(self,
                 dataset_spec,
                 nu_network,
                 zeta_network,
                 nu_optimizer,
                 zeta_optimizer,
                 gamma: Union[float, tf.Tensor],
                 reward_fn: Optional[Callable] = None,
                 solve_for_state_action_ratio: bool = True,
                 f_exponent: float = 1.5,
                 primal_form: bool = False,
                 num_samples: Optional[int] = None,
                 nu_regularizer: float = 0.,
                 zeta_regularizer: float = 0.):
        """Initializes the solver.

    Args:
      dataset_spec: The spec of the dataset that will be given.
      nu_network: The nu-value network.
      zeta_network: The zeta-value network.
      nu_optimizer: The optimizer to use for nu.
      zeta_optimizer: The optimizer to use for zeta.
      gamma: The discount factor to use.
      reward_fn: A function that takes in an EnvStep and returns the reward
        for that step. If not specified, defaults to just EnvStep.reward.
      solve_for_state_action_ratio: Whether to solve for state-action density
        ratio. Defaults to True.
      f_exponent: Exponent p to use for f(x) = |x|^p / p.
      primal_form: Whether to use primal form of DualDICE, which optimizes for
        nu independent of zeta. This form is biased in stochastic environments.
        Defaults to False, which uses the saddle-point formulation of DualDICE.
      num_samples: Number of samples to take from policy to estimate average
        next nu value. If actions are discrete, this defaults to computing
        average explicitly. If actions are not discrete, this defaults to using
        a single sample.
      nu_regularizer: Regularization coefficient on nu network.
      zeta_regularizer: Regularization coefficient on zeta network.
    """
        self._dataset_spec = dataset_spec
        self._nu_network = nu_network
        self._nu_network.create_variables()
        self._zeta_network = zeta_network
        self._zeta_network.create_variables()

        self._nu_optimizer = nu_optimizer
        self._zeta_optimizer = zeta_optimizer
        self._nu_regularizer = nu_regularizer
        self._zeta_regularizer = zeta_regularizer

        self._gamma = gamma
        if reward_fn is None:
            reward_fn = lambda env_step: env_step.reward
        self._reward_fn = reward_fn
        self._num_samples = num_samples

        self._solve_for_state_action_ratio = solve_for_state_action_ratio
        if (not self._solve_for_state_action_ratio
                and not self._dataset_spec.has_log_probability()):
            raise ValueError('Dataset must contain log-probability when '
                             'solve_for_state_action_ratio is False.')

        if f_exponent <= 1:
            raise ValueError('Exponent for f must be greater than 1.')
        fstar_exponent = f_exponent / (f_exponent - 1)
        self._f_fn = lambda x: tf.abs(x)**f_exponent / f_exponent
        self._fstar_fn = lambda x: tf.abs(x)**fstar_exponent / fstar_exponent

        self._categorical_action = common_lib.is_categorical_spec(
            self._dataset_spec.action)
        if not self._categorical_action and self._num_samples is None:
            self._num_samples = 1

        self._primal_form = primal_form
        self._initialize()
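
`_f_fn` and `_fstar_fn` form a Fenchel-conjugate pair: for f(x) = |x|^p / p with p > 1, the convex conjugate is f*(y) = |y|^q / q with q = p / (p - 1), which is exactly `fstar_exponent` above. A quick numerical sanity check of that identity:

# Numerical check that |y|**q / q (q = p / (p - 1)) matches the Fenchel
# conjugate f*(y) = max_x (x * y - |x|**p / p), computed on a grid.
import numpy as np

p = 1.5
q = p / (p - 1.0)   # 3.0
x = np.linspace(-50.0, 50.0, 200001)

for y in [-2.0, -0.5, 0.7, 1.3]:
    conjugate_on_grid = np.max(x * y - np.abs(x)**p / p)
    closed_form = np.abs(y)**q / q
    assert np.isclose(conjugate_on_grid, closed_form, atol=1e-3)
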
Example #6
    def __init__(
            self,
            dataset_spec,
            alpha_optimizer,
            gamma: Union[float, tf.Tensor],
            divergence_limit: Union[float, np.ndarray, tf.Tensor],
            reward_fn: Optional[Callable] = None,
            solve_for_state_action_ratio: bool = True,
            divergence_type: Text = 'rkl',  # One of 'kl', 'rkl', 'chi2'.
            algae_alpha: Union[float, tf.Tensor] = 1.0,
            weight_by_gamma: bool = True,
            limit_episodes: Optional[int] = None,
            num_samples: Optional[int] = None):
        """Initializes the solver.

    Args:
      dataset_spec: The spec of the dataset that will be given.
      alpha_optimizer: The optimizer to use for the Lagrange multipliers on the
        data weights.
      gamma: The discount factor to use.
      divergence_limit: The limit on the f-divergence between the weights and
        the empirical distribution; it is duplicated internally into a
        two-sided limit.
      reward_fn: A function that takes in an EnvStep and returns the reward for
        that step. If not specified, defaults to just EnvStep.reward.
      solve_for_state_action_ratio: Whether to solve for the state-action
        density ratio. Defaults to True; set to False to instead solve for the
        state density ratio. Although the estimated policy value should be the
        same, approximating using the state density ratio is much faster
        (especially in large environments) and more accurate (especially in
        low-data regimes).
      divergence_type: The type of f-divergence to use, one of 'kl', 'rkl', or
        'chi2'. Defaults to 'rkl' (reverse KL).
      algae_alpha: Regularizer coefficient on Df(dpi || dD).
      weight_by_gamma: Whether to weight the nu and zeta losses by
        gamma ** step_num. Defaults to True.
      limit_episodes: How many episodes to take from the dataset. Defaults to
        None (take whole dataset).
      num_samples: Number of samples to take from the policy to estimate the
        average next value. If actions are discrete, this defaults to computing
        the average explicitly; otherwise it defaults to a single sample.
    """
        self._dataset_spec = dataset_spec
        self._gamma = gamma
        if reward_fn is None:
            reward_fn = lambda env_step: env_step.reward
        self._reward_fn = reward_fn

        self._solve_for_state_action_ratio = solve_for_state_action_ratio
        if (not self._solve_for_state_action_ratio
                and not self._dataset_spec.has_log_probability()):
            raise ValueError('Dataset must contain log-probability when '
                             'solve_for_state_action_ratio is False.')

        # Get number of states/actions.
        observation_spec = self._dataset_spec.observation
        action_spec = self._dataset_spec.action
        if not common_lib.is_categorical_spec(observation_spec):
            raise ValueError('Observation spec must be discrete and bounded.')
        self._num_states = observation_spec.maximum + 1

        if not common_lib.is_categorical_spec(action_spec):
            raise ValueError('Action spec must be discrete and bounded.')
        self._num_actions = action_spec.maximum + 1
        self._dimension = 1 + (self._num_states * self._num_actions
                               if self._solve_for_state_action_ratio else
                               self._num_states)

        # For learning data weight
        self._divergence_limit = tf.convert_to_tensor(divergence_limit,
                                                      dtype=tf.float32)
        if tf.rank(self._divergence_limit) < 1:
            self._divergence_limit = tf.expand_dims(self._divergence_limit, -1)
        self._two_sided_limit = tf.concat(
            [self._divergence_limit, self._divergence_limit], -1)
        self._num_limits = int(self._two_sided_limit.shape[0])
        # The lagrange multiplier w.r.t. data weight constraint
        self._alpha = tf.Variable(np.zeros(self._two_sided_limit.shape),
                                  dtype=tf.float32)
        self._alpha_optimizer = alpha_optimizer

        self._algae_alpha = tf.convert_to_tensor(algae_alpha, dtype=tf.float32)
        if tf.rank(self._algae_alpha) < 1:
            self._algae_alpha = tf.expand_dims(self._algae_alpha, -1)
        if self._algae_alpha.shape[-1] != self._two_sided_limit.shape[-1]:
            self._algae_alpha *= tf.ones_like(self._two_sided_limit)
        self._algae_alpha_sign = 2 * (
            tf.cast(self._algae_alpha >= 0, tf.float32) - 0.5)

        self._num_samples = num_samples
        self._categorical_action = common_lib.is_categorical_spec(
            self._dataset_spec.action)
        if not self._categorical_action and self._num_samples is None:
            self._num_samples = 1

        self._divergence_type = divergence_type
        if self._divergence_type not in ['kl', 'rkl', 'chi2']:
            raise ValueError('Unsupported divergence type %s.' %
                             self._divergence_type)

        self._nu = tf.zeros([self._dimension, self._num_limits])
        self._nu2 = tf.zeros([self._dimension, self._num_limits])
        self._zeta = tf.zeros([self._dimension, self._num_limits])
        self._zeta2 = tf.zeros([self._dimension, self._num_limits])
        self._weight_by_gamma = weight_by_gamma
        self._limit_episodes = limit_episodes
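
The constructor accepts 'kl', 'rkl', or 'chi2'. For reference, the corresponding f-divergence generator functions in one common convention are sketched below (the solver's own downstream use of the divergence is not shown in this snippet):

# Common convention for the generators behind 'kl', 'rkl', 'chi2'.
# Illustrative only; not the functions actually used by the solver.
import tensorflow as tf


def f_divergence_generator(divergence_type, x):
    """f(x) such that D_f(P || Q) = E_Q[f(dP/dQ)], defined for x > 0."""
    if divergence_type == 'kl':
        return x * tf.math.log(x)
    elif divergence_type == 'rkl':
        return -tf.math.log(x)
    elif divergence_type == 'chi2':
        return 0.5 * (x - 1.0) ** 2
    raise ValueError('Unsupported divergence type %s.' % divergence_type)
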
Example #7
  def __init__(self,
               dataset_spec,
               gamma: Union[float, tf.Tensor],
               reward_fn: Optional[Callable] = None,
               solve_for_state_action_ratio: bool = True,
               divergence_limit: Union[float, np.ndarray, tf.Tensor] = 0.0,
               divergence_type: Text = 'rkl',
               nu_learning_rate: Union[float, tf.Tensor] = 0.1,
               zeta_learning_rate: Union[float, tf.Tensor] = 0.1,
               algae_alpha: Union[float, tf.Tensor] = 1.0,
               limit_episodes: Optional[int] = None):
    """Initializes the solver.

    Args:
      dataset_spec: The spec of the dataset that will be given.
      gamma: The discount factor to use.
      reward_fn: A function that takes in an EnvStep and returns the reward for
        that step. If not specified, defaults to just EnvStep.reward.
      solve_for_state_action_ratio: Whether to solve for state-action density
        ratio. Defaults to True. When solving an environment with a large
        state/action space (taxi), better to set this to False to avoid OOM
        issues.
      divergence_limit: The limit on the f-divergence between the weights and
        the empirical distribution.
      divergence_type: The type of f-divergence to use, e.g., 'kl'. Defaults to
        'rkl', reverse KL.
      nu_learning_rate: Learning rate for nu.
      zeta_learning_rate: Learning rate for zeta.
      algae_alpha: Regularizer coefficient on Df(dpi || dD).
      limit_episodes: How many episodes to take from the dataset. Defaults to
        None (take whole dataset).
    """
    self._dataset_spec = dataset_spec
    self._gamma = gamma
    if reward_fn is None:
      reward_fn = lambda env_step: env_step.reward
    self._reward_fn = reward_fn

    self._solve_for_state_action_ratio = solve_for_state_action_ratio
    if (not self._solve_for_state_action_ratio and
        not self._dataset_spec.has_log_probability()):
      raise ValueError('Dataset must contain log-probability when '
                       'solve_for_state_action_ratio is False.')

    # Get number of states/actions.
    observation_spec = self._dataset_spec.observation
    action_spec = self._dataset_spec.action
    if not common_lib.is_categorical_spec(observation_spec):
      raise ValueError('Observation spec must be discrete and bounded.')
    self._num_states = observation_spec.maximum + 1

    if not common_lib.is_categorical_spec(action_spec):
      raise ValueError('Action spec must be discrete and bounded.')
    self._num_actions = action_spec.maximum + 1
    self._dimension = 1 + (
        self._num_states * self._num_actions
        if self._solve_for_state_action_ratio else self._num_states)

    # For learning data weight
    self._divergence_limit = tf.convert_to_tensor(
        divergence_limit, dtype=tf.float32)
    if tf.rank(self._divergence_limit) < 1:
      self._divergence_limit = tf.expand_dims(self._divergence_limit, -1)
    self._two_sided_limit = tf.concat(
        [self._divergence_limit, self._divergence_limit], -1)
    self._num_limits = int(self._two_sided_limit.shape[0])
    # The lagrange multiplier w.r.t. data weight constraint
    self._alpha = tf.Variable(
        np.zeros(self._two_sided_limit.shape), dtype=tf.float32)

    self._algae_alpha = tf.convert_to_tensor(algae_alpha, dtype=tf.float32)
    if tf.rank(self._algae_alpha) < 1:
      self._algae_alpha = tf.expand_dims(self._algae_alpha, -1)
    if self._algae_alpha.shape[-1] != self._two_sided_limit.shape[-1]:
      self._algae_alpha *= tf.ones_like(self._two_sided_limit)
    self._algae_alpha_sign = 2 * (
        tf.cast(self._algae_alpha >= 0, tf.float32) - 0.5)

    self._divergence_type = divergence_type
    if self._divergence_type not in ['kl', 'rkl', 'chi2']:
      raise ValueError('Unsupported divergence type %s.' %
                       self._divergence_type)

    self._nu_learning_rate = nu_learning_rate
    self._zeta_learning_rate = zeta_learning_rate

    # We have two variables to counteract the bias introduced by algae_alpha.
    self._nu = tf.zeros([self._dimension, self._num_limits])
    self._nu2 = tf.zeros([self._dimension, self._num_limits])
    self._zeta = tf.zeros([self._dimension, self._num_limits])
    self._zeta2 = tf.zeros([self._dimension, self._num_limits])
    self._limit_episodes = limit_episodes
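
As the constructor shows, a scalar divergence_limit is first expanded to rank 1 and then duplicated into a two-sided limit, so the alpha variable and the nu/zeta tables get one column per limit. A quick shape trace with a scalar limit:

# Shape trace for the divergence-limit handling above, using a scalar limit.
import numpy as np
import tensorflow as tf

divergence_limit = tf.convert_to_tensor(0.1, dtype=tf.float32)          # shape []
if tf.rank(divergence_limit) < 1:
    divergence_limit = tf.expand_dims(divergence_limit, -1)             # shape [1]
two_sided_limit = tf.concat([divergence_limit, divergence_limit], -1)   # shape [2]
num_limits = int(two_sided_limit.shape[0])                              # 2
alpha = tf.Variable(np.zeros(two_sided_limit.shape), dtype=tf.float32)  # shape [2]
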
Example #8
    def __init__(self,
                 dataset_spec,
                 gamma: Union[float, tf.Tensor],
                 reward_fn: Optional[Callable] = None,
                 solve_for_state_action_ratio: bool = True,
                 nu_learning_rate: Union[float, tf.Tensor] = 0.1,
                 zeta_learning_rate: Union[float, tf.Tensor] = 0.1,
                 kl_regularizer: Union[float, tf.Tensor] = 1.,
                 eps_std: Union[float, tf.Tensor] = 1):
        """Initializes the solver.

    Args:
      dataset_spec: The spec of the dataset that will be given.
      gamma: The discount factor to use.
      reward_fn: A function that takes in an EnvStep and returns the reward for
        that step. If not specified, defaults to just EnvStep.reward.
      solve_for_state_action_ratio: Whether to solve for state-action density
        ratio. Defaults to True. When solving an environment with a large
        state/action space (taxi), better to set this to False to avoid OOM
        issues.
      nu_learning_rate: Learning rate for nu.
      zeta_learning_rate: Learning rate for zeta.
      kl_regularizer: Regularization constant for D_kl(q || p).
      eps_std: epsilon standard deviation for sampling from the posterior.
    """
        self._dataset_spec = dataset_spec
        self._gamma = gamma
        if reward_fn is None:
            reward_fn = lambda env_step: env_step.reward
        self._reward_fn = reward_fn
        self._kl_regularizer = kl_regularizer
        self._eps_std = eps_std

        self._solve_for_state_action_ratio = solve_for_state_action_ratio
        if (not self._solve_for_state_action_ratio
                and not self._dataset_spec.has_log_probability()):
            raise ValueError('Dataset must contain log-probability when '
                             'solve_for_state_action_ratio is False.')

        # Get number of states/actions.
        observation_spec = self._dataset_spec.observation
        action_spec = self._dataset_spec.action
        if not common_lib.is_categorical_spec(observation_spec):
            raise ValueError('Observation spec must be discrete and bounded.')
        self._num_states = observation_spec.maximum + 1

        if not common_lib.is_categorical_spec(action_spec):
            raise ValueError('Action spec must be discrete and bounded.')
        self._num_actions = action_spec.maximum + 1
        self._dimension = (self._num_states * self._num_actions
                           if self._solve_for_state_action_ratio else
                           self._num_states)

        self._td_residuals = np.zeros([self._dimension, self._dimension])
        self._total_weights = np.zeros([self._dimension])
        self._initial_weights = np.zeros([self._dimension])

        self._nu_optimizer = tf.keras.optimizers.Adam(nu_learning_rate)
        self._zeta_optimizer = tf.keras.optimizers.Adam(zeta_learning_rate)

        # Initialize variational Bayes parameters
        self._nu_mu = tf.Variable(tf.zeros([self._dimension]))
        self._nu_log_sigma = tf.Variable(tf.zeros([self._dimension]))
        self._prior_mu = tf.Variable(tf.zeros([self._dimension]),
                                     trainable=True)
        self._prior_log_sigma = tf.Variable(tf.zeros([self._dimension]),
                                            trainable=False)
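
The `_nu_mu` / `_nu_log_sigma` pair together with `eps_std` suggests a Gaussian posterior over nu. A sketch of how such a posterior is typically sampled via the reparameterization trick (an assumption; the solver's actual sampling code is not shown above):

# Sketch of a reparameterized posterior sample for nu, assuming a Gaussian
# posterior N(nu_mu, exp(nu_log_sigma)**2); illustrative only.
import tensorflow as tf


def sample_nu(nu_mu, nu_log_sigma, eps_std=1.0):
    eps = tf.random.normal(tf.shape(nu_mu), stddev=eps_std)
    return nu_mu + tf.exp(nu_log_sigma) * eps
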
Example #9
    def __init__(self,
                 dataset_spec,
                 policy_optimizer,
                 policy_network,
                 mode,
                 ci_method,
                 delta_tail,
                 gamma: Union[float, tf.Tensor],
                 reward_fn: Optional[Callable] = None,
                 clipping: Optional[float] = 2000.,
                 policy_regularizer: float = 0.,
                 q_network=None,
                 q_optimizer=None,
                 target_update_tau: Union[float, tf.Tensor] = 0.01,
                 target_update_period: int = 1,
                 num_samples: Optional[int] = None):
        """Initializes the importance sampling estimator.

    Args:
      dataset_spec: The spec of the dataset that will be given.
      policy_optimizer: The optimizer to use for learning policy.
      policy_network: The policy NN network.
      mode: Importance sampling estimator (e.g., "weighted-step-wise").
      ci_method: Method for constructing confidence intervals (e.g., "CH" for
        Chernoff-Hoeffding).
      delta_tail: Total tail probability for the confidence interval (halved in
        code to split across the two tails).
      gamma: The discount factor to use.
      reward_fn: A function that takes in an EnvStep and returns the reward for
        that step. If not specified, defaults to just EnvStep.reward.
      clipping: Threshold for clipping IS factor.
      policy_regularizer: Coefficient on the policy regularizer.
      q_network: A function that returns the values for each observation and
        action. If specified, the Q-values are learned and used for
        doubly-robust estimation.
      q_optimizer: TF optimizer for q_network.
      target_update_tau: Coefficient for soft updates of the target network
        parameters.
      target_update_period: Period, in train steps, between target network
        updates.
      num_samples: Number of samples to take from policy to estimate average
        next state value. If actions are discrete, this defaults to computing
        average explicitly. If actions are not discrete, this defaults to using
        a single sample.
    """
        self._dataset_spec = dataset_spec
        self._policy_optimizer = policy_optimizer
        self._policy_network = policy_network
        if self._policy_network is not None:
            self._policy_network.create_variables()
        self._mode = mode
        self._ci_method = ci_method
        self._delta_tail = delta_tail
        self._gamma = gamma
        if reward_fn is None:
            reward_fn = lambda env_step: env_step.reward
        self._reward_fn = reward_fn
        self._clipping = clipping
        self._policy_regularizer = policy_regularizer

        self._q_network = q_network
        if self._q_network is not None:
            self._q_network.create_variables()
            self._target_network = self._q_network.copy(name='TargetQNetwork')
            self._target_network.create_variables()
            self._target_update_tau = target_update_tau
            self._target_update_period = target_update_period
            self._update_targets = self._get_target_updater(
                tau=self._target_update_tau, period=self._target_update_period)
            self._q_optimizer = q_optimizer
            self._initialize()

        self._num_samples = num_samples
        self._categorical_action = common_lib.is_categorical_spec(
            self._dataset_spec.action)
        if not self._categorical_action and self._num_samples is None:
            self._num_samples = 1
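
For ci_method='CH' (Chernoff-Hoeffding), a standard two-sided Hoeffding bound for the mean of n values in [0, value_range] has half-width value_range * sqrt(log(2 / delta) / (2 n)), which matches the docstring's note that delta_tail is split across the two tails. A minimal sketch, not the estimator's own implementation:

# Minimal Hoeffding-style confidence interval; illustrative only.
import numpy as np


def hoeffding_interval(values, delta_tail, value_range):
    """Two-sided Hoeffding CI for the mean of values bounded in [0, value_range]."""
    n = len(values)
    mean = np.mean(values)
    half_width = value_range * np.sqrt(np.log(2.0 / delta_tail) / (2.0 * n))
    return mean - half_width, mean + half_width
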