Example #1
    def __init__(self, backbone: LinearBackboneModel, action_space: gym.Space):
        super().__init__()

        self.backbone = backbone
        self.action_head = ActionHead(action_space=action_space,
                                      input_dim=self.backbone.output_dim)
        self.value_head = ValueHead(input_dim=self.backbone.output_dim)
Example #2
    def __init__(self, backbone: LinearBackboneModel, action_space: gym.Space,
                 input_block: typing.Optional[nn.Module] = None):
        super().__init__()

        self.input_block = input_block
        self.backbone = backbone
        self.action_head = ActionHead(
            action_space=action_space,
            input_dim=self.backbone.output_dim
        )
        self.value_head = ValueHead(input_dim=self.backbone.output_dim)
Example #3
    def __init__(self, input_block: BackboneModel,
                 backbone: RnnLinearBackboneModel, action_space: gym.Space):
        super().__init__()

        self.input_block = input_block
        self.backbone = backbone

        self.action_head = ActionHead(action_space=action_space,
                                      input_dim=self.backbone.output_dim)
        self.value_head = ValueHead(input_dim=self.backbone.output_dim)

        assert self.backbone.is_recurrent, "Backbone must be a recurrent model"
Example #4
    def __init__(self, input_block: BackboneModel,
                 policy_backbone: LinearBackboneModel,
                 value_backbone: LinearBackboneModel, action_space: gym.Space):
        super().__init__()

        self.input_block = input_block
        self.policy_backbone = policy_backbone
        self.value_backbone = value_backbone

        self.action_head = ActionHead(
            action_space=action_space,
            input_dim=self.policy_backbone.output_dim)

        self.value_head = ValueHead(input_dim=self.value_backbone.output_dim)
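Every constructor above relies on a small backbone contract: an output_dim attribute for sizing the heads, a reset_weights() method, and (for the recurrent variant) an is_recurrent flag. Below is a minimal sketch of a backbone satisfying that contract; the class name and the initialization scheme are illustrative assumptions, not part of the library.

import torch
import torch.nn as nn


class ToyLinearBackbone(nn.Module):
    """ Hypothetical stand-in for a LinearBackboneModel (assumption for illustration) """

    def __init__(self, input_dim: int, hidden_dim: int = 64):
        super().__init__()
        self._output_dim = hidden_dim
        self.layer = nn.Linear(input_dim, hidden_dim)

    @property
    def output_dim(self) -> int:
        """ Size of the feature vector fed to the action/value heads """
        return self._output_dim

    @property
    def is_recurrent(self) -> bool:
        """ This backbone keeps no recurrent state """
        return False

    def reset_weights(self):
        """ Re-initialize the single linear layer """
        nn.init.orthogonal_(self.layer.weight, gain=2 ** 0.5)
        nn.init.zeros_(self.layer.bias)

    def forward(self, observations):
        return torch.relu(self.layer(observations))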
Example #5
class PolicyGradientModel(Model):
    """ For a policy gradient algorithm we need set of custom heads for our model """
    def __init__(self, backbone: LinearBackboneModel, action_space: gym.Space):
        super().__init__()

        self.backbone = backbone
        self.action_head = ActionHead(action_space=action_space,
                                      input_dim=self.backbone.output_dim)
        self.value_head = ValueHead(input_dim=self.backbone.output_dim)

    def reset_weights(self):
        """ Initialize properly model weights """
        self.backbone.reset_weights()
        self.action_head.reset_weights()
        self.value_head.reset_weights()

    def forward(self, observations):
        """ Calculate model outputs """
        base_output = self.backbone(observations)

        action_output = self.action_head(base_output)
        value_output = self.value_head(base_output)

        return action_output, value_output

    def step(self, observation, argmax_sampling=False):
        """ Select actions based on model's output """
        action_pd_params, value_output = self(observation)
        actions = self.action_head.sample(action_pd_params,
                                          argmax_sampling=argmax_sampling)

        # log likelihood of selected action
        logprob = self.action_head.logprob(actions, action_pd_params)

        return {'actions': actions, 'values': value_output, 'logprob': logprob}

    def logprob(self, action_sample, action_params):
        """ Calculate - log(prob) of selected actions """
        return self.action_head.logprob(action_sample, action_params)

    def value(self, observation):
        """ Calculate only value head for given state """
        base_output = self.backbone(observation)
        value_output = self.value_head(base_output)
        return value_output

    def entropy(self, action_pd_params):
        """ Entropy of a probability distribution """
        return self.action_head.entropy(action_pd_params)
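A minimal usage sketch for the class above (an assumption, not from the source page): it presumes PolicyGradientModel, ActionHead and ValueHead are defined or importable in scope and reuses the ToyLinearBackbone stub sketched after Example #4; the CartPole-like dimensions are illustrative.

import gym
import torch

action_space = gym.spaces.Discrete(2)             # two discrete actions
backbone = ToyLinearBackbone(input_dim=4)         # 4-dimensional observations

model = PolicyGradientModel(backbone=backbone, action_space=action_space)
model.reset_weights()

observations = torch.randn(8, 4)                  # batch of 8 fake observations
output = model.step(observations)                 # sample actions, values, logprob
print(output['actions'].shape, output['values'].shape, output['logprob'].shape)

values_only = model.value(observations)           # critic output alone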
Example #6
class StochasticPolicyModel(RlModel):
    """
    Most generic policy gradient model class with a set of common actor-critic heads that share a single backbone
    """
    def __init__(self, input_block: BackboneModel,
                 backbone: LinearBackboneModel, action_space: gym.Space):
        super().__init__()

        self.input_block = input_block
        self.backbone = backbone
        self.action_head = ActionHead(action_space=action_space,
                                      input_dim=self.backbone.output_dim)
        self.value_head = ValueHead(input_dim=self.backbone.output_dim)

    def reset_weights(self):
        """ Initialize properly model weights """
        self.input_block.reset_weights()
        self.backbone.reset_weights()
        self.action_head.reset_weights()
        self.value_head.reset_weights()

    def forward(self, observations):
        """ Calculate model outputs """
        input_data = self.input_block(observations)

        base_output = self.backbone(input_data)

        action_output = self.action_head(base_output)
        value_output = self.value_head(base_output)

        return action_output, value_output

    def step(self, observation, argmax_sampling=False):
        """ Select actions based on model's output """
        action_pd_params, value_output = self(observation)
        actions = self.action_head.sample(action_pd_params,
                                          argmax_sampling=argmax_sampling)

        # log likelihood of selected action
        logprobs = self.action_head.logprob(actions, action_pd_params)

        return {
            'actions': actions,
            'values': value_output,
            'action:logprobs': logprobs
        }

    def evaluate(self, rollout: Rollout) -> Evaluator:
        """ Evaluate model on a rollout """
        return StochasticPolicyEvaluator(self, rollout)

    def logprob(self, action_sample, policy_params):
        """ Calculate - log(prob) of selected actions """
        return self.action_head.logprob(action_sample, policy_params)

    def value(self, observations):
        """ Calculate only value head for given state """
        input_data = self.input_block(observations)
        base_output = self.backbone(input_data)
        value_output = self.value_head(base_output)
        return value_output

    def entropy(self, policy_params):
        """ Entropy of a probability distribution """
        return self.action_head.entropy(policy_params)
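A usage sketch for the two-stage pipeline above (assumptions only): a hypothetical input block that flattens image-like observations feeds the ToyLinearBackbone stub from the earlier sketch, and the model samples a batch of actions.

import gym
import torch
import torch.nn as nn


class FlattenInputBlock(nn.Module):
    """ Hypothetical input block: flatten observations into a vector """

    def reset_weights(self):
        pass                                       # nothing to initialize

    def forward(self, observations):
        return observations.reshape(observations.size(0), -1)


input_block = FlattenInputBlock()
backbone = ToyLinearBackbone(input_dim=3 * 8 * 8)  # tiny 3x8x8 observations
action_space = gym.spaces.Discrete(4)

model = StochasticPolicyModel(input_block=input_block, backbone=backbone,
                              action_space=action_space)
model.reset_weights()

observations = torch.randn(16, 3, 8, 8)
output = model.step(observations)
print(output['actions'].shape, output['action:logprobs'].shape)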
Example #7
class PolicyGradientRnnModel(RnnModel):
    """ For a policy gradient algorithm we need set of custom heads for our model """

    def __init__(self, backbone: RnnLinearBackboneModel, action_space: gym.Space,
                 input_block: typing.Optional[nn.Module] = None):
        super().__init__()

        self.input_block = input_block
        self.backbone = backbone

        self.action_head = ActionHead(
            action_space=action_space,
            input_dim=self.backbone.output_dim
        )
        self.value_head = ValueHead(input_dim=self.backbone.output_dim)

        assert self.backbone.is_recurrent, "Backbone must be a recurrent model"

    @property
    def state_dim(self) -> int:
        """ Dimension of model state """
        return self.backbone.state_dim

    def reset_weights(self):
        """ Initialize properly model weights """
        self.backbone.reset_weights()
        self.action_head.reset_weights()
        self.value_head.reset_weights()

    def forward(self, observations, state):
        """ Calculate model outputs """
        if self.input_block is not None:
            input_data = self.input_block(observations)
        else:
            input_data = observations

        base_output, new_state = self.backbone(input_data, state=state)

        action_output = self.action_head(base_output)
        value_output = self.value_head(base_output)

        return action_output, value_output, new_state

    def step(self, observations, state, argmax_sampling=False):
        """ Select actions based on model's output """
        action_pd_params, value_output, new_state = self(observations, state)
        actions = self.action_head.sample(action_pd_params, argmax_sampling=argmax_sampling)

        # log likelihood of selected action
        logprobs = self.action_head.logprob(actions, action_pd_params)

        return {
            'actions': actions,
            'values': value_output,
            'logprobs': logprobs,
            'state': new_state
        }

    def evaluate(self, rollout: Rollout) -> Evaluator:
        """ Evaluate model on a rollout """
        return PolicyGradientRnnEvaluator(self, rollout)

    def logprob(self, action_sample, policy_params):
        """ Calculate - log(prob) of selected actions """
        return self.action_head.logprob(action_sample, policy_params)

    def value(self, observations, state):
        """ Calculate only value head for given state """
        if self.input_block is not None:
            input_data = self.input_block(observations)
        else:
            input_data = observations

        base_output, new_state = self.backbone(input_data, state)
        value_output = self.value_head(base_output)

        return value_output

    def entropy(self, action_pd_params):
        """ Entropy of a probability distribution """
        return self.action_head.entropy(action_pd_params)
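The recurrent model only requires the backbone to report is_recurrent and state_dim and to return a (features, new_state) pair. Below is a sketch with a hypothetical GRU-cell backbone, threading the recurrent state through a few steps; names and sizes are assumptions, not part of the source.

import gym
import torch
import torch.nn as nn


class ToyRnnBackbone(nn.Module):
    """ Hypothetical recurrent backbone satisfying the contract used above """

    def __init__(self, input_dim: int, hidden_dim: int = 32):
        super().__init__()
        self.cell = nn.GRUCell(input_dim, hidden_dim)
        self.output_dim = hidden_dim
        self.state_dim = hidden_dim
        self.is_recurrent = True

    def reset_weights(self):
        pass                                       # keep PyTorch default init

    def forward(self, input_data, state):
        new_state = self.cell(input_data, state)
        return new_state, new_state                # features and recurrent state


backbone = ToyRnnBackbone(input_dim=4)
model = PolicyGradientRnnModel(backbone=backbone,
                               action_space=gym.spaces.Discrete(2))
model.reset_weights()

state = torch.zeros(1, model.state_dim)            # initial hidden state
for _ in range(5):                                  # roll out, threading the state
    observation = torch.randn(1, 4)
    output = model.step(observation, state)
    state = output['state']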
Example #8
class PolicyGradientModelSeparate(Model):
    """ For a policy gradient algorithm we need set of custom heads for our model """
    def __init__(self, policy_backbone: LinearBackboneModel,
                 value_backbone: LinearBackboneModel, action_space: gym.Space):
        super().__init__()

        self.policy_backbone = policy_backbone
        self.value_backbone = value_backbone

        self.action_head = ActionHead(
            action_space=action_space,
            input_dim=self.policy_backbone.output_dim)

        self.value_head = ValueHead(input_dim=self.value_backbone.output_dim)

    def reset_weights(self):
        """ Initialize properly model weights """
        self.policy_backbone.reset_weights()
        self.value_backbone.reset_weights()

        self.action_head.reset_weights()
        self.value_head.reset_weights()

    def forward(self, observations):
        """ Calculate model outputs """
        policy_base_output = self.policy_backbone(observations)
        value_base_output = self.value_backbone(observations)

        action_output = self.action_head(policy_base_output)
        value_output = self.value_head(value_base_output)

        return action_output, value_output

    def step(self, observation, argmax_sampling=False):
        """ Select actions based on model's output """
        policy_params, values = self(observation)
        actions = self.action_head.sample(policy_params,
                                          argmax_sampling=argmax_sampling)

        # log likelihood of selected action
        logprobs = self.action_head.logprob(actions, policy_params)

        return {'actions': actions, 'values': values, 'logprobs': logprobs}

    def policy_parameters(self):
        """ Parameters of policy """
        return it.chain(self.policy_backbone.parameters(),
                        self.action_head.parameters())

    def logprob(self, action_sample, policy_params):
        """ Calculate - log(prob) of selected actions """
        return self.action_head.logprob(action_sample, policy_params)

    def value(self, observation):
        """ Calculate only value head for given state """
        base_output = self.value_backbone(observation)
        value_output = self.value_head(base_output)
        return value_output

    def policy(self, observation):
        """ Calculate only action head for given state """
        policy_base_output = self.policy_backbone(observation)
        policy_params = self.action_head(policy_base_output)
        return policy_params

    def evaluate(self, rollout: Rollout) -> Evaluator:
        """ Evaluate model on a rollout """
        return PolicyGradientEvaluator(self, rollout)

    def entropy(self, policy_params):
        """ Entropy of a probability distribution """
        return self.action_head.entropy(policy_params)

    def kl_divergence(self, pd_q, pd_p):
        """ Calculate KL-divergence between two probability distributions """
        return self.action_head.kl_divergence(pd_q, pd_p)
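With separate backbones, policy_parameters() makes it easy to optimize the actor independently of the critic. A sketch under the same assumptions as before (ToyLinearBackbone is the stub from the earlier sketch; the optimizer choices are illustrative):

import itertools
import gym
import torch

policy_backbone = ToyLinearBackbone(input_dim=4)
value_backbone = ToyLinearBackbone(input_dim=4)

model = PolicyGradientModelSeparate(policy_backbone=policy_backbone,
                                    value_backbone=value_backbone,
                                    action_space=gym.spaces.Discrete(2))
model.reset_weights()

# Actor (policy backbone + action head) and critic get their own optimizers
policy_optimizer = torch.optim.Adam(model.policy_parameters(), lr=3e-4)
value_optimizer = torch.optim.Adam(
    itertools.chain(model.value_backbone.parameters(),
                    model.value_head.parameters()),
    lr=1e-3)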
Example #9
class StochasticPolicyModelSeparate(RlModel):
    """
    Policy gradient model class with actor and critic heads that don't share a backbone
    """
    def __init__(self, input_block: BackboneModel,
                 policy_backbone: LinearBackboneModel,
                 value_backbone: LinearBackboneModel, action_space: gym.Space):
        super().__init__()

        self.input_block = input_block
        self.policy_backbone = policy_backbone
        self.value_backbone = value_backbone

        self.action_head = ActionHead(
            action_space=action_space,
            input_dim=self.policy_backbone.output_dim)

        self.value_head = ValueHead(input_dim=self.value_backbone.output_dim)

    def reset_weights(self):
        """ Initialize properly model weights """
        self.input_block.reset_weights()

        self.policy_backbone.reset_weights()
        self.value_backbone.reset_weights()

        self.action_head.reset_weights()
        self.value_head.reset_weights()

    def forward(self, observations):
        """ Calculate model outputs """
        input_data = self.input_block(observations)

        policy_base_output = self.policy_backbone(input_data)
        value_base_output = self.value_backbone(input_data)

        action_output = self.action_head(policy_base_output)
        value_output = self.value_head(value_base_output)

        return action_output, value_output

    def step(self, observation, argmax_sampling=False):
        """ Select actions based on model's output """
        policy_params, values = self(observation)
        actions = self.action_head.sample(policy_params,
                                          argmax_sampling=argmax_sampling)

        # log likelihood of selected action
        logprobs = self.action_head.logprob(actions, policy_params)

        return {
            'actions': actions,
            'values': values,
            'action:logprobs': logprobs
        }

    def policy_parameters(self):
        """ Parameters of policy """
        return it.chain(self.policy_backbone.parameters(),
                        self.action_head.parameters())

    def logprob(self, action_sample, policy_params):
        """ Calculate - log(prob) of selected actions """
        return self.action_head.logprob(action_sample, policy_params)

    def value(self, observations):
        """ Calculate only value head for given state """
        input_data = self.input_block(observations)
        base_output = self.value_backbone(input_data)
        value_output = self.value_head(base_output)
        return value_output

    def policy(self, observations):
        """ Calculate only action head for given state """
        input_data = self.input_block(observations)
        policy_base_output = self.policy_backbone(input_data)
        policy_params = self.action_head(policy_base_output)
        return policy_params

    def evaluate(self, rollout: Rollout) -> Evaluator:
        """ Evaluate model on a rollout """
        return StochasticPolicyEvaluator(self, rollout)

    def entropy(self, policy_params):
        """ Entropy of a probability distribution """
        return self.action_head.entropy(policy_params)

    def kl_divergence(self, pd_q, pd_p):
        """ Calculate KL-divergence between two probability distributions """
        return self.action_head.kl_divergence(pd_q, pd_p)
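A closing sketch (assumptions only, reusing the FlattenInputBlock and ToyLinearBackbone stubs from the earlier sketches) showing how the separate policy() and value() paths of the class above can be queried independently:

import gym
import torch

model = StochasticPolicyModelSeparate(
    input_block=FlattenInputBlock(),
    policy_backbone=ToyLinearBackbone(input_dim=3 * 8 * 8),
    value_backbone=ToyLinearBackbone(input_dim=3 * 8 * 8),
    action_space=gym.spaces.Discrete(4))
model.reset_weights()

observations = torch.randn(16, 3, 8, 8)
policy_params = model.policy(observations)         # actor head only
values = model.value(observations)                  # critic head only
entropy = model.entropy(policy_params)               # policy entropy, e.g. for regularization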