def __init__(self, observation_space, action_space, config):
    """
    Example config:
        {
            'actions': {0, 1, 2},
            'alpha': 0.1,
            'epsilon': 0.1,
            'gamma': 0.6,
            'seed': 42,
            'init': 0.0,
        }
    """
    Policy.__init__(self, observation_space, action_space, config)
    # Parameters
    self.set_of_actions = deepcopy(config['actions'])
    self.alpha = deepcopy(config['alpha'])
    self.gamma = deepcopy(config['gamma'])
    self.epsilon = deepcopy(config['epsilon'])
    self.qtable = QTable(self.set_of_actions,
                         default=config['init'],
                         seed=config['seed'])
    self.qtable_state_action_counter = QTable(self.set_of_actions,
                                              default=0)
    # Note: a single list() instance is passed as the default here; QTable is
    # assumed to copy it per state-action entry rather than share one list.
    self.qtable_state_action_reward = QTable(self.set_of_actions,
                                             default=list())
    # self.qtable_new_state_action_total_reward = QTable(self.set_of_actions, default=list())
    self.rndgen = RandomState(config['seed'])
    # Logging
    self.stats = dict()
    self._reset_stats_values()
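For context, the alpha, gamma, and epsilon stored above drive a standard tabular Q-learning update. The following is a minimal, self-contained sketch of that update with epsilon-greedy action selection; it uses a plain dict instead of the QTable class, which is not shown in this example.

from collections import defaultdict
import random


class TabularQLearner:
    """Hypothetical stand-in for the QTable-based policy above."""

    def __init__(self, actions, alpha=0.1, gamma=0.6, epsilon=0.1, init=0.0, seed=42):
        self.actions = list(actions)
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        self.q = defaultdict(lambda: {a: init for a in self.actions})
        self.rng = random.Random(seed)

    def act(self, state):
        # Epsilon-greedy: explore with probability epsilon, otherwise be greedy.
        if self.rng.random() < self.epsilon:
            return self.rng.choice(self.actions)
        return max(self.q[state], key=self.q[state].get)

    def learn(self, state, action, reward, next_state):
        # Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
        best_next = max(self.q[next_state].values())
        td_target = reward + self.gamma * best_next
        self.q[state][action] += self.alpha * (td_target - self.q[state][action])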
Example #2
def build_q_models(policy: Policy, obs_space: gym.Space,
                   action_space: gym.Space,
                   config: TrainerConfigDict) -> ModelV2:

    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.q_model = ModelCatalog.get_model_v2(obs_space=obs_space,
                                               action_space=action_space,
                                               num_outputs=action_space.n,
                                               model_config=config["model"],
                                               framework=config["framework"],
                                               name=Q_SCOPE)

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_TARGET_SCOPE)

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
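Example #3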
def load_metanash_pure_strat(policy: Policy, pure_strat_spec: StrategySpec):
    pure_strat_checkpoint_path = pure_strat_spec.metadata["checkpoint_path"]
    checkpoint_data = deepdish.io.load(path=pure_strat_checkpoint_path)
    weights = checkpoint_data["weights"]
    weights = {k.replace("_dot_", "."): v for k, v in weights.items()}
    policy.set_weights(weights=weights)
    policy.p2sro_policy_spec = pure_strat_spec
Example #4
    def __init__(self, observation_space, action_space, config, dqn_config):
        Policy.__init__(self, observation_space, action_space, config)
        self.device = torch.device(
            f"cuda:{dqn_config['cuda_id']}" if torch.cuda.is_available() else "cpu")
        self.dqn_config = dqn_config
        self.epsilon = 1

        self.num_states = int(np.prod(observation_space.shape))
        self.num_actions = action_space.n
        print(
            f'dqn state space:{self.num_states}, action space:{self.num_actions}'
        )

        # self.eval_net = DQNModule(self.num_states, self.num_actions).to(self.device)
        # self.target_net = DQNModule(self.num_states, self.num_actions).to(self.device)

        self.eval_net = DQNActionModule(self.device, self.num_states,
                                        self.num_actions).to(self.device)
        self.target_net = DQNActionModule(self.device, self.num_states,
                                          self.num_actions).to(self.device)

        self.target_net.load_state_dict(self.eval_net.state_dict())

        self.learn_step_counter = 0
        self.memory = replay_memory(dqn_config['replay_capacity'],
                                    num_result=6)

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(),
                                          lr=dqn_config['lr'])
        # self.loss_func = nn.SmoothL1Loss()
        self.loss_func = nn.MSELoss().to(self.device)

        self.rand_action = 0
        self.greedy_action = 0
Example #5
def spl_torch_loss(
        policy: Policy, model: ModelV2,
        dist_class: Type[TorchDistributionWrapper],
        train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]:
    """The basic policy gradients loss function.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[TorchDistributionWrapper]): The action distr. class.
        train_batch (SampleBatch): The training data.

    Returns:
        Union[TensorType, List[TensorType]]: A single loss tensor or a list
            of loss tensors.
    """
    # Pass the training data through our model to get distribution parameters.
    dist_inputs, _ = model.from_batch(train_batch)
    # Create an action distribution object.
    predictions = dist_class(dist_inputs, model)

    targets = []
    if policy.config["learn_action"]:
        targets.append(train_batch[SampleBatch.ACTIONS])
    if policy.config["learn_reward"]:
        targets.append(train_batch[SampleBatch.REWARDS])
    assert len(targets) > 0
    targets = torch.cat(targets, dim=0)

    # Save the loss in the policy object for the spl_stats below.
    policy.spl_loss = policy.config["loss_fn"](predictions.dist.probs, targets)
    policy.entropy = predictions.dist.entropy().mean()

    return policy.spl_loss
Example #6
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)
        self.observation_space = observation_space
        self.action_space = action_space
        self.config = config
        self.action_shape = action_space.n

        # GPU settings
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # This attribute will be incremented every time learn_on_batch is called.
        self.iteration = 0

        # The current time step.
        self.current_step = 0

        # Agent parameters.
        self.lr = self.config["lr"]
        self.gamma = self.config["gamma"]
        self.target_update_frequency = self.config["target_update_frequency"]

        # Strategy
        self.strategy = \
            EpsilonGreedyStrategy(self.config["eps_start"], self.config["eps_end"], self.config["eps_decay"])

        # Replay memory
        self.memory = ReplayMemory(self.config["replay_memory_size"])

        # Policy network
        self.policy_net = ModelCatalog.get_model_v2(
            obs_space=self.observation_space,
            action_space=self.action_space,
            num_outputs=4,
            name="DQNModel",
            model_config=self.config["dqn_model"],
            framework="torch",
        ).to(self.device, non_blocking=True)

        # Target network
        self.target_net = ModelCatalog.get_model_v2(
            obs_space=self.observation_space,
            action_space=self.action_space,
            num_outputs=4,
            name="DQNModel",
            model_config=self.config["dqn_model"],
            framework="torch",
        ).to(self.device, non_blocking=True)

        # Set the weights & biases in the target_net to be the same as those in the policy_net.
        self.target_net.load_state_dict(self.policy_net.state_dict())
        # Put target_net in eval mode. This network will only be used for inference.
        self.target_net.eval()

        # Optimizer.
        self.optimizer = optim.RMSprop(self.policy_net.parameters())

        # The calculated loss.
        self.loss = 0
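EpsilonGreedyStrategy and ReplayMemory are not shown in this example. Below is a minimal sketch of what the strategy could look like, assuming an exponential decay of epsilon from eps_start to eps_end controlled by eps_decay; the class body and the get_exploration_rate method name are assumptions, not the original implementation.

import math


class EpsilonGreedyStrategy:
    """Hypothetical sketch: epsilon decays exponentially from eps_start to eps_end."""

    def __init__(self, eps_start, eps_end, eps_decay):
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

    def get_exploration_rate(self, current_step):
        # eps_end + (eps_start - eps_end) * exp(-eps_decay * step)
        return self.eps_end + (self.eps_start - self.eps_end) * math.exp(
            -self.eps_decay * current_step)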
Example #7
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)

        # You can replace this with whatever variable you want to save
        # the state of the policy in. `get_weights` and `set_weights`
        # are used for checkpointing the states and restoring the states
        # from a checkpoint.
        self.w = []
Example #8
def build_q_losses(
    policy: Policy,
    model: ModelV2,
    dist_class: Type[TFActionDistribution],
    train_batch: SampleBatch,
) -> TensorType:
    """Constructs the loss for SimpleQTFPolicy.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[ActionDistribution]): The action distribution class.
        train_batch (SampleBatch): The training data.

    Returns:
        TensorType: A single loss tensor.
    """
    # q network evaluation
    q_t = compute_q_values(policy,
                           policy.model,
                           train_batch[SampleBatch.CUR_OBS],
                           explore=False)

    # target q network evaluation
    q_tp1 = compute_q_values(policy,
                             policy.target_model,
                             train_batch[SampleBatch.NEXT_OBS],
                             explore=False)
    if not hasattr(policy, "q_func_vars"):
        policy.q_func_vars = model.variables()
        policy.target_q_func_vars = policy.target_model.variables()

    # q scores for actions which we know were selected in the given state.
    one_hot_selection = tf.one_hot(
        tf.cast(train_batch[SampleBatch.ACTIONS], tf.int32),
        policy.action_space.n)
    q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1)

    # compute estimate of best possible value starting from state at t + 1
    dones = tf.cast(train_batch[SampleBatch.DONES], tf.float32)
    q_tp1_best_one_hot_selection = tf.one_hot(tf.argmax(q_tp1, 1),
                                              policy.action_space.n)
    q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
    q_tp1_best_masked = (1.0 - dones) * q_tp1_best

    # compute RHS of bellman equation
    q_t_selected_target = (train_batch[SampleBatch.REWARDS] +
                           policy.config["gamma"] * q_tp1_best_masked)

    # compute the error (potentially clipped)
    td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
    loss = tf.reduce_mean(huber_loss(td_error))

    # save TD error as an attribute for outside access
    policy.td_error = td_error

    return loss
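Example #9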
    def __init__(self, observation_space, action_space, config):
        Policy.__init__(self, observation_space, action_space, config)
        self.method = Method()

        self.episode_length = episode_length = config['rollout_fragment_length']
        self.n_envs = n_envs = config['num_envs_per_worker']

        MAX_BUFFER_SIZE = 1000
        self.total_envs = total_envs = config['num_workers'] * config['num_envs_per_worker']

        self.buffer = TrajBuffer(episode_length, total_envs, MAX_BUFFER_SIZE)
Example #10
def build_q_losses(policy: Policy, model, dist_class,
                   train_batch: SampleBatch) -> TensorType:
    """Constructs the loss for SimpleQTorchPolicy.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[ActionDistribution]): The action distribution class.
        train_batch (SampleBatch): The training data.

    Returns:
        TensorType: A single loss tensor.
    """
    # q network evaluation
    q_t = compute_q_values(policy,
                           policy.q_model,
                           train_batch[SampleBatch.CUR_OBS],
                           explore=False,
                           is_training=True)

    # target q network evaluation
    q_tp1 = compute_q_values(policy,
                             policy.target_q_model,
                             train_batch[SampleBatch.NEXT_OBS],
                             explore=False,
                             is_training=True)

    # q scores for actions which we know were selected in the given state.
    one_hot_selection = F.one_hot(train_batch[SampleBatch.ACTIONS].long(),
                                  policy.action_space.n)
    q_t_selected = torch.sum(q_t * one_hot_selection, 1)

    # compute estimate of best possible value starting from state at t + 1
    dones = train_batch[SampleBatch.DONES].float()
    q_tp1_best_one_hot_selection = F.one_hot(torch.argmax(q_tp1, 1),
                                             policy.action_space.n)
    q_tp1_best = torch.sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
    q_tp1_best_masked = (1.0 - dones) * q_tp1_best

    # compute RHS of bellman equation
    q_t_selected_target = (train_batch[SampleBatch.REWARDS] +
                           policy.config["gamma"] * q_tp1_best_masked)

    # Compute the error (Square/Huber).
    td_error = q_t_selected - q_t_selected_target.detach()
    # loss = torch.mean(huber_loss(td_error)) # NFSP on Kuhn/Leduc poker fails with huber_loss

    loss = F.mse_loss(input=q_t_selected, target=q_t_selected_target.detach())

    # save TD error as an attribute for outside access
    policy.td_error = td_error
    policy.loss = loss
    return loss
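Both SimpleQ losses above build the same one-step TD target and mask out the bootstrap term on terminal transitions. A small standalone PyTorch sketch of that target computation, for illustration only:

import torch

# target = r + gamma * (1 - done) * max_a' Q_target(s', a')
rewards = torch.tensor([1.0, 0.0])
dones = torch.tensor([0.0, 1.0])                 # second transition is terminal
q_tp1 = torch.tensor([[0.5, 2.0], [3.0, 1.0]])   # target-network Q-values at s'
gamma = 0.99

q_tp1_best = q_tp1.max(dim=1).values             # best next-state value per row
q_t_selected_target = rewards + gamma * (1.0 - dones) * q_tp1_best
print(q_t_selected_target)                       # tensor([2.9800, 0.0000])

Example #11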
def load_pure_strat(policy: Policy, pure_strat_spec, checkpoint_path: str = None):
    assert pure_strat_spec is None or checkpoint_path is None, "can only pass one or the other"
    if checkpoint_path is None:
        if hasattr(policy, "policy_spec") and pure_strat_spec == policy.policy_spec:
            return
        pure_strat_checkpoint_path = pure_strat_spec.metadata["checkpoint_path"]
    else:
        pure_strat_checkpoint_path = checkpoint_path

    checkpoint_data = deepdish.io.load(path=pure_strat_checkpoint_path)
    weights = checkpoint_data["weights"]
    weights = {k.replace("_dot_", "."): v for k, v in weights.items()}
    policy.set_weights(weights=weights)
    policy.policy_spec = pure_strat_spec
Example #12
    def load_pure_strat_cached(policy: Policy, pure_strat_spec):

        pure_strat_checkpoint_path = pure_strat_spec.metadata[
            "checkpoint_path"]

        if pure_strat_checkpoint_path in cache:
            weights = cache[pure_strat_checkpoint_path]
        else:
            checkpoint_data = deepdish.io.load(path=pure_strat_checkpoint_path)
            weights = checkpoint_data["weights"]
            weights = {k.replace("_dot_", "."): v for k, v in weights.items()}
            cache[pure_strat_checkpoint_path] = weights

        policy.set_weights(weights=weights)
        policy.policy_spec = pure_strat_spec
Example #13
def pg_tf_loss(
        policy: Policy, model: ModelV2, dist_class: Type[ActionDistribution],
        train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]:
    """The basic policy gradients loss function.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[ActionDistribution]): The action distr. class.
        train_batch (SampleBatch): The training data.

    Returns:
        Union[TensorType, List[TensorType]]: A single loss tensor or a list
            of loss tensors.
    """
    # Pass the training data through our model to get distribution parameters.
    dist_inputs, _ = model(train_batch)

    # Create an action distribution object.
    action_dist = dist_class(dist_inputs, model)

    # Calculate the vanilla PG loss based on:
    # L = -E[ log(pi(a|s)) * A]
    loss = -tf.reduce_mean(
        action_dist.logp(train_batch[SampleBatch.ACTIONS]) *
        tf.cast(train_batch[Postprocessing.ADVANTAGES], dtype=tf.float32))

    policy.policy_loss = loss

    return loss
Example #14
def pg_torch_loss(
        policy: Policy, model: ModelV2,
        dist_class: Type[TorchDistributionWrapper],
        train_batch: SampleBatch) -> Union[TensorType, List[TensorType]]:
    """The basic policy gradients loss function.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[TorchDistributionWrapper]): The action distr. class.
        train_batch (SampleBatch): The training data.

    Returns:
        Union[TensorType, List[TensorType]]: A single loss tensor or a list
            of loss tensors.
    """
    # Pass the training data through our model to get distribution parameters.
    dist_inputs, _ = model.from_batch(train_batch)

    # Create an action distribution object.
    action_dist = dist_class(dist_inputs, model)

    # Calculate the vanilla PG loss based on:
    # L = -E[ log(pi(a|s)) * A]
    log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS])

    # Save the loss in the policy object for the stats_fn below.
    policy.pi_err = -torch.mean(
        log_probs * train_batch[Postprocessing.ADVANTAGES])

    return policy.pi_err
Example #15
def get_distribution_inputs_and_class(
        policy: Policy,
        q_model: ModelV2,
        obs_batch: TensorType,
        *,
        explore=True,
        is_training=True,
        **kwargs) -> Tuple[TensorType, type, List[TensorType]]:
    """Build the action distribution"""
    q_vals = compute_q_values(policy, q_model, obs_batch, explore, is_training)
    q_vals = q_vals[0] if isinstance(q_vals, tuple) else q_vals

    policy.q_values = q_vals
    policy.q_func_vars = q_model.variables()
    return policy.q_values, (TorchCategorical if policy.config["framework"]
                             == "torch" else Categorical), []  # state-outs
def spl_torch_loss(
    policy: Policy,
    model: ModelV2,
    dist_class: Type[TorchDistributionWrapper],
    train_batch: SampleBatch,
) -> Union[TensorType, List[TensorType]]:
    """The basic policy gradients loss function.

    Args:
        policy (Policy): The Policy to calculate the loss for.
        model (ModelV2): The Model to calculate the loss for.
        dist_class (Type[TorchDistributionWrapper]): The action distr. class.
        train_batch (SampleBatch): The training data.

    Returns:
        Union[TensorType, List[TensorType]]: A single loss tensor or a list
            of loss tensors.
    """
    # Pass the training data through our model to get distribution parameters.
    dist_inputs, _ = model.from_batch(train_batch)
    # Create an action distribution object.
    action_dist = dist_class(dist_inputs, model)
    if policy.config["explore"]:
        # Added because of a bug in TorchCategorical
        # which modifies dist_inputs through action_dist:
        _, _ = policy.exploration.get_exploration_action(
            action_distribution=action_dist,
            timestep=policy.global_timestep,
            explore=policy.config["explore"],
        )
        action_dist = dist_class(dist_inputs, policy.model)

    targets = []
    if policy.config["learn_action"]:
        targets.append(train_batch[SampleBatch.ACTIONS])
    if policy.config["learn_reward"]:
        targets.append(train_batch[SampleBatch.REWARDS])
    assert len(targets) > 0, (f"In config, use learn_action=True and/or "
                              f"learn_reward=True to specify which target to "
                              f"use in supervised learning")
    targets = torch.cat(targets, dim=0)

    # Save the loss in the policy object for the spl_stats below.
    policy.spl_loss = policy.config["loss_fn"](action_dist.dist.probs, targets)
    policy.entropy = action_dist.dist.entropy().mean()

    return policy.spl_loss
Example #17
def _get_log_from_policy(policy: Policy, policy_id: PolicyID) -> dict:
    """Gets the to_log var from a policy and renames its keys, adding the policy_id as a prefix."""
    to_log = {}
    if hasattr(policy, "to_log"):
        for k, v in policy.to_log.items():
            to_log[f"{k}/{policy_id}"] = v
        policy.to_log = {}
    return to_log
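One possible way to use a helper like this is from a trainer callback that gathers per-policy logs across rollout workers. The sketch below is illustrative and assumes RLlib's WorkerSet.foreach_policy, which calls the given function as func(policy, policy_id):

def collect_policy_logs(trainer) -> dict:
    # Merge the to_log dicts reported by every policy on every worker.
    merged = {}
    for policy_log in trainer.workers.foreach_policy(_get_log_from_policy):
        merged.update(policy_log)
    return merged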
Example #18
def compute_action(policy: Policy,
                   input_dict: Dict[str, np.ndarray],
                   explore: bool) -> Any:
    """Compute predicted action by the policy.

    .. note::
        It supports both Pytorch and Tensorflow backends (both eager and
        compiled graph modes).

    :param policy: `rllib.poli.Policy` to use to predict the action, which is
                   a thin wrapper around the actual policy model.
    :param input_dict: Input dictionary for forward as input of the policy.
    :param explore: Whether or not to enable exploration during sampling of the
                    action.
    """
    if policy.framework == 'torch':
        with torch.no_grad():
            input_dict = policy._lazy_tensor_dict(input_dict)
            action_logits, _ = policy.model(input_dict)
            action_dist = policy.dist_class(action_logits, policy.model)
            if explore:
                action_torch = action_dist.sample()
            else:
                action_torch = action_dist.deterministic_sample()
            action = action_torch.cpu().numpy()
    elif tf.compat.v1.executing_eagerly():
        action_logits, _ = policy.model(input_dict)
        action_dist = policy.dist_class(action_logits, policy.model)
        if explore:
            action_tf = action_dist.sample()
        else:
            action_tf = action_dist.deterministic_sample()
        action = action_tf.numpy()
    else:
        # This obscure piece of code takes advantage of already existing
        # placeholders to avoid creating new nodes to evaluate the computation
        # graph. It is several orders of magnitude more efficient than calling
        # `action_logits, _ = model(input_dict).eval(session=policy._sess)`.
        feed_dict = {policy._input_dict[key]: value
                     for key, value in input_dict.items()
                     if key in policy._input_dict.keys()}
        feed_dict[policy._is_exploring] = explore
        action = policy._sess.run(
            policy._sampled_action, feed_dict=feed_dict)
    return action
Example #19
        def _get_weights_from_policy(policy: Policy, policy_id: PolicyID) -> dict:
            """Gets the weights from a policy and renames its keys, adding the policy_id as a prefix."""
            to_log = {}
            weights = policy.get_weights()

            for k, v in weights.items():
                if isinstance(v, Iterable):
                    to_log[f"{policy_id}/{k}"] = v

            return to_log
Example #20
def build_q_models(policy: Policy, obs_space: gym.spaces.Space,
                   action_space: gym.spaces.Space,
                   config: TrainerConfigDict) -> ModelV2:
    """Build q_model and target_q_model for Simple Q learning

    Note that this function works for both Tensorflow and PyTorch.

    Args:
        policy (Policy): The Policy, which will use the model for optimization.
        obs_space (gym.spaces.Space): The policy's observation space.
        action_space (gym.spaces.Space): The policy's action space.
        config (TrainerConfigDict):

    Returns:
        ModelV2: The Model for the Policy to use.
            Note: The target q model will not be returned, just assigned to
            `policy.target_q_model`.
    """
    if not isinstance(action_space, gym.spaces.MultiDiscrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.nvec.max(),
        model_config=config["model"],
        framework=config["framework"],
        name=Q_SCOPE)

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.nvec.max(),
        model_config=config["model"],
        framework=config["framework"],
        name=Q_TARGET_SCOPE)

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
Example #21
def compute_q_values(policy: Policy,
                     model: ModelV2,
                     obs: TensorType,
                     explore,
                     is_training=None) -> TensorType:
    _is_training = (is_training if is_training is not None else
                    policy._get_is_training_placeholder())
    model_out, _ = model(SampleBatch(obs=obs, _is_training=_is_training), [],
                         None)

    return model_out
Example #22
File: ma_dqn.py  Project: songCNMS/vrp
    def __init__(self, eval_net, target_net, observation_space, action_space,
                 config, dqn_config):
        Policy.__init__(self, observation_space, action_space, config)
        self.device = torch.device(
            f"cuda:{dqn_config['cuda_id']}" if torch.cuda.is_available() else "cpu")
        self.dqn_config = dqn_config
        self.epsilon = 1

        self.prioritized_memory = dqn_config.get('prioritized_memry', False)

        # self.epsilon_delta = (dqn_config['update_period']  / dqn_config['replay_capacity'])
        self.epsilon_delta = 1e-3

        self.num_states = int(np.prod(observation_space.shape))
        self.num_actions = action_space.n
        print(
            f'dqn state space:{self.num_states}, action space:{self.num_actions}'
        )

        # self.eval_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
        # self.target_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)

        self.eval_net = eval_net.to(self.device)
        self.target_net = target_net.to(self.device)
        self.target_net.load_state_dict(self.eval_net.state_dict())

        self.learn_step_counter = 0

        if self.prioritized_memory:
            self.memory = PrioritizedMemory(dqn_config['replay_capacity'],
                                            num_result=5)
        else:
            self.memory = Memory(dqn_config['replay_capacity'], num_result=5)

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(),
                                          lr=dqn_config['lr'])
        # self.loss_func = nn.SmoothL1Loss()
        self.loss_func = nn.MSELoss().to(self.device)

        self.rand_action = 0
        self.greedy_action = 0
Example #23
def compute_q_values(policy: Policy,
                     model: ModelV2,
                     obs: TensorType,
                     explore,
                     is_training=None) -> TensorType:
    model_out, _ = model({
        SampleBatch.CUR_OBS: obs,
        "is_training": is_training
        if is_training is not None else policy._get_is_training_placeholder(),
    }, [], None)

    return model_out
Example #24
def load_pure_strat(policy: Policy,
                    pure_strat_spec: StrategySpec = None,
                    checkpoint_path: str = None,
                    weights_key: str = "weights"):
    if pure_strat_spec is not None and checkpoint_path is not None:
        raise ValueError(
            "Can only pass pure_strat_spec or checkpoint_path but not both")
    if checkpoint_path is None:
        if hasattr(policy,
                   "policy_spec") and pure_strat_spec == policy.policy_spec:
            return
        pure_strat_checkpoint_path = pure_strat_spec.metadata[
            "checkpoint_path"]
    else:
        pure_strat_checkpoint_path = checkpoint_path

    weights = None

    try:
        num_load_attempts = 5
        for attempt in range(num_load_attempts):
            try:
                checkpoint_data = deepdish.io.load(
                    path=pure_strat_checkpoint_path)
                weights = checkpoint_data[weights_key]
                break
            except (HDF5ExtError, KeyError):
                if attempt + 1 == num_load_attempts:
                    raise
                time.sleep(1.0)

    #TODO use correct exception
    except Exception:
        with open(pure_strat_checkpoint_path, "rb") as pickle_file:
            checkpoint_data = cloudpickle.load(pickle_file)
            weights = checkpoint_data[weights_key]

    weights = {k.replace("_dot_", "."): v for k, v in weights.items()}
    policy.set_weights(weights=weights)
    policy.policy_spec = pure_strat_spec
Example #25
File: ma_dp_dqn.py  Project: songCNMS/vrp
    def __init__(self, agent_id, eval_net, target_net, observation_space, action_space, config, dqn_config):
        Policy.__init__(self, observation_space, action_space, config)
        self.device = torch.device(f"cuda:{dqn_config['cuda_id']}" if torch.cuda.is_available() else "cpu")
        self.dqn_config = dqn_config
        self.epsilon = 1
        self.agent_id = agent_id

        # self.epsilon_delta = (dqn_config['update_period']  / dqn_config['replay_capacity'])
        self.epsilon_delta = 1e-3

        self.num_states = int(np.prod(observation_space.shape))
        self.num_actions = action_space.n
        print(f'dqn state space:{self.num_states}, action space:{self.num_actions}')

        # self.eval_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)
        # self.target_net = DQNModule(self.num_states, self.num_actions, self.device).to(self.device)

        self.eval_net = eval_net.to(self.device)
        self.target_net = target_net.to(self.device)
        
        parameters = set()
        for layer in self.eval_net.dp_models.keys():
            parameters |= set(self.eval_net.dp_models[layer].parameters())
        self.optimizer = torch.optim.Adam(parameters, lr=dqn_config['lr'])
        self.learn_step_counter = 0

        self.memory = LayerMemory(dqn_config['replay_capacity'], num_result=5)

         
        # self.loss_func = nn.SmoothL1Loss()
        self.loss_func = nn.MSELoss().to(self.device)

        self.rand_action = 0
        self.greedy_action = 0

        self.x_action = []
        for i in range(self.num_actions):
            _action = np.zeros(self.eval_net.transition_model.num_actions)
            _action[self.agent_id*self.num_actions+i] = 1.0
            self.x_action.append(_action)
Example #26
    def __init__(self, agent_id, observation_space, action_space, dqn_config,
                 models):
        Policy.__init__(self, observation_space, action_space, dqn_config)
        self.max_num_nodes = dqn_config['max_num_nodes']
        self.dqn_config = dqn_config
        self.model_abstract_on = dqn_config['model_abstract_on']
        self.num_states = int(np.prod(observation_space.shape))
        self.num_actions = action_space.n
        print(
            f'dqn state space:{self.num_states}, action space:{self.num_actions}'
        )
        self.epsilon = 1
        self.agent_id = agent_id
        self.learn_step_counter = 0
        self.eval_net = models['eval_net']
        self.target_net = models['target_net']
        self.all_layers = self.eval_net.get_all_layers()
        self.policies = {}
        for layer in self.all_layers:
            policy = DQNDPTorchPolicy(agent_id, observation_space,
                                      action_space, dqn_config, layer, models)
            self.policies[layer] = policy
Example #27
def pg_loss_stats(policy: Policy,
                  train_batch: SampleBatch) -> Dict[str, TensorType]:
    """Returns the calculated loss in a stats dict.

    Args:
        policy (Policy): The Policy object.
        train_batch (SampleBatch): The data used for training.

    Returns:
        Dict[str, TensorType]: The stats dict.
    """

    return {
        "policy_loss":
        torch.mean(torch.stack(policy.get_tower_stats("policy_loss"))),
    }
Example #28
def stats_fn(policy: Policy, batch: SampleBatch) -> Dict[str, TensorType]:
    return {"loss": torch.mean(torch.stack(policy.get_tower_stats("loss")))}
Example #29
    def __init__(self, agent_id, observation_space, action_space, dqn_config,
                 layer, models):
        Policy.__init__(self, observation_space, action_space, dqn_config)
        self.total_device_num = torch.cuda.device_count()
        self.device = torch.device(f"cuda:{layer % self.total_device_num}"
                                   if torch.cuda.is_available() else "cpu")

        self.dqn_config = dqn_config
        self.epsilon = 1
        self.agent_id = agent_id
        self.layer = layer

        self.num_states = int(np.prod(observation_space.shape))
        self.num_actions = action_space.n

        # self.epsilon_delta = (dqn_config['update_period']  / dqn_config['replay_capacity'])
        self.model_abstract_on = dqn_config['model_abstract_on']
        self.internal_update_freq = dqn_config['internal_update_freq']
        self.batch_size = dqn_config['batch_size']
        self.min_batch_size = dqn_config['min_batch_size']
        self.epsilon_delta = 1e-3
        self.encoder_feature_dim = self.num_states
        if self.model_abstract_on:
            self.encoder_feature_dim += dqn_config['encoder_feature_dim']
        self.discount = dqn_config['dist_distance_discount']
        self.bisim_coef = dqn_config['bisim_coef']

        # self.eval_net = DQNDPModule(self.encoder_feature_dim, self.num_actions, dqn_config)
        # self.target_net = DQNDPModule(self.encoder_feature_dim, self.num_actions, dqn_config)

        self.eval_net = models['eval_net']
        self.target_net = models['target_net']

        self.optimizer = torch.optim.Adam(self.eval_net.get_parameters(layer),
                                          lr=dqn_config['lr'])
        self.learn_step_counter = 0
        self.memory = LayerMemory(dqn_config['replay_capacity'],
                                  layer,
                                  self.batch_size,
                                  torch.device('cpu'),
                                  num_result=5)

        self.loss_func = nn.SmoothL1Loss().to(self.device)
        # self.loss_func = nn.MSELoss().to(self.device)

        self.rand_action = 0
        self.greedy_action = 0

        if self.model_abstract_on:
            decoder_parameters = set()
            self.target_encoder_model = models['target_encoder']

            self.eval_encoder_model = models['eval_encoder']
            self.eval_reward_model = models['eval_reward']
            self.eval_transition_model = models['eval_transition']

            self.encoder_optimizer = torch.optim.Adam(
                self.eval_encoder_model.get_parameters(layer),
                lr=dqn_config['lr'])
            decoder_parameters = (
                self.eval_transition_model.get_parameters(layer)
                | self.eval_reward_model.get_parameters(layer))
            self.decoder_optimizer = torch.optim.Adam(decoder_parameters,
                                                      lr=dqn_config['lr'])
Example #30
File: poker_utils.py  Project: indylab/nxdo
def set_policy_weights(policy: Policy, checkpoint_path: str):
    checkpoint_data = deepdish.io.load(path=checkpoint_path)
    weights = checkpoint_data["weights"]
    weights = {k.replace("_dot_", "."): v for k, v in weights.items()}
    policy.set_weights(weights)
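The "_dot_" replacement in the loaders above implies the checkpoints were written with "." escaped in the weight keys. Below is a hedged sketch of the matching save side, assuming deepdish HDF5 checkpoints as used by the loaders; save_policy_weights is illustrative and not taken from the original project.

import deepdish

def save_policy_weights(policy: Policy, checkpoint_path: str):
    # Escape "." in weight keys before saving; the loaders above undo this
    # with k.replace("_dot_", ".").
    weights = {k.replace(".", "_dot_"): v
               for k, v in policy.get_weights().items()}
    deepdish.io.save(checkpoint_path, {"weights": weights})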