class PolicyGradientModel(Model):
    """ For a policy gradient algorithm we need a set of custom heads for our model """

    def __init__(self, backbone: LinearBackboneModel, action_space: gym.Space):
        super().__init__()

        self.backbone = backbone
        self.action_head = ActionHead(action_space=action_space, input_dim=self.backbone.output_dim)
        self.value_head = ValueHead(input_dim=self.backbone.output_dim)

    def reset_weights(self):
        """ Properly initialize model weights """
        self.backbone.reset_weights()
        self.action_head.reset_weights()
        self.value_head.reset_weights()

    def forward(self, observations):
        """ Calculate model outputs """
        base_output = self.backbone(observations)

        action_output = self.action_head(base_output)
        value_output = self.value_head(base_output)

        return action_output, value_output

    def step(self, observation, argmax_sampling=False):
        """ Select actions based on the model's output """
        action_pd_params, value_output = self(observation)
        actions = self.action_head.sample(action_pd_params, argmax_sampling=argmax_sampling)

        # log likelihood of selected action
        logprob = self.action_head.logprob(actions, action_pd_params)

        return {'actions': actions, 'values': value_output, 'logprob': logprob}

    def logprob(self, action_sample, action_params):
        """ Calculate log-probability of selected actions """
        return self.action_head.logprob(action_sample, action_params)

    def value(self, observation):
        """ Calculate only the value head for a given state """
        base_output = self.backbone(observation)
        value_output = self.value_head(base_output)
        return value_output

    def entropy(self, action_pd_params):
        """ Entropy of a probability distribution """
        return self.action_head.entropy(action_pd_params)

class StochasticPolicyModel(RlModel):
    """
    Most generic policy gradient model class with a set of common actor-critic heads
    that share a single backbone
    """

    def __init__(self, input_block: BackboneModel, backbone: LinearBackboneModel, action_space: gym.Space):
        super().__init__()

        self.input_block = input_block
        self.backbone = backbone
        self.action_head = ActionHead(action_space=action_space, input_dim=self.backbone.output_dim)
        self.value_head = ValueHead(input_dim=self.backbone.output_dim)

    def reset_weights(self):
        """ Properly initialize model weights """
        self.input_block.reset_weights()
        self.backbone.reset_weights()
        self.action_head.reset_weights()
        self.value_head.reset_weights()

    def forward(self, observations):
        """ Calculate model outputs """
        input_data = self.input_block(observations)
        base_output = self.backbone(input_data)

        action_output = self.action_head(base_output)
        value_output = self.value_head(base_output)

        return action_output, value_output

    def step(self, observation, argmax_sampling=False):
        """ Select actions based on the model's output """
        action_pd_params, value_output = self(observation)
        actions = self.action_head.sample(action_pd_params, argmax_sampling=argmax_sampling)

        # log likelihood of selected action
        logprobs = self.action_head.logprob(actions, action_pd_params)

        return {
            'actions': actions,
            'values': value_output,
            'action:logprobs': logprobs
        }

    def evaluate(self, rollout: Rollout) -> Evaluator:
        """ Evaluate model on a rollout """
        return StochasticPolicyEvaluator(self, rollout)

    def logprob(self, action_sample, policy_params):
        """ Calculate log-probability of selected actions """
        return self.action_head.logprob(action_sample, policy_params)

    def value(self, observations):
        """ Calculate only the value head for a given state """
        input_data = self.input_block(observations)
        base_output = self.backbone(input_data)
        value_output = self.value_head(base_output)
        return value_output

    def entropy(self, policy_params):
        """ Entropy of a probability distribution """
        return self.action_head.entropy(policy_params)

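# --- Editorial usage sketch (not part of the original source) ---
# Illustrates how a sampling loop would typically query StochasticPolicyModel.step().
# `model` and `observations` are assumed inputs: an already-constructed model and a
# batched observation tensor accepted by its input block.
def _example_policy_step(model, observations):
    """ Sample actions for a batch of observations and unpack the step dictionary """
    step_output = model.step(observations)

    actions = step_output['actions']           # sampled actions
    values = step_output['values']             # critic estimates V(s)
    logprobs = step_output['action:logprobs']  # log-probabilities of the sampled actions

    return actions, values, logprobs
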
class PolicyGradientRnnModel(RnnModel):
    """ For a policy gradient algorithm we need a set of custom heads for our model """

    def __init__(self, backbone: RnnLinearBackboneModel, action_space: gym.Space,
                 input_block: typing.Optional[nn.Module] = None):
        super().__init__()

        self.input_block = input_block
        self.backbone = backbone
        self.action_head = ActionHead(
            action_space=action_space,
            input_dim=self.backbone.output_dim
        )
        self.value_head = ValueHead(input_dim=self.backbone.output_dim)

        assert self.backbone.is_recurrent, "Backbone must be a recurrent model"

    @property
    def state_dim(self) -> int:
        """ Dimension of model state """
        return self.backbone.state_dim

    def reset_weights(self):
        """ Properly initialize model weights """
        self.backbone.reset_weights()
        self.action_head.reset_weights()
        self.value_head.reset_weights()

    def forward(self, observations, state):
        """ Calculate model outputs """
        if self.input_block is not None:
            input_data = self.input_block(observations)
        else:
            input_data = observations

        base_output, new_state = self.backbone(input_data, state=state)

        action_output = self.action_head(base_output)
        value_output = self.value_head(base_output)

        return action_output, value_output, new_state

    def step(self, observations, state, argmax_sampling=False):
        """ Select actions based on the model's output """
        action_pd_params, value_output, new_state = self(observations, state)
        actions = self.action_head.sample(action_pd_params, argmax_sampling=argmax_sampling)

        # log likelihood of selected action
        logprobs = self.action_head.logprob(actions, action_pd_params)

        return {
            'actions': actions,
            'values': value_output,
            'logprobs': logprobs,
            'state': new_state
        }

    def evaluate(self, rollout: Rollout) -> Evaluator:
        """ Evaluate model on a rollout """
        return PolicyGradientRnnEvaluator(self, rollout)

    def logprob(self, action_sample, policy_params):
        """ Calculate log-probability of selected actions """
        return self.action_head.logprob(action_sample, policy_params)

    def value(self, observations, state):
        """ Calculate only the value head for a given state """
        if self.input_block is not None:
            input_data = self.input_block(observations)
        else:
            input_data = observations

        base_output, new_state = self.backbone(input_data, state)
        value_output = self.value_head(base_output)
        return value_output

    def entropy(self, action_pd_params):
        """ Entropy of a probability distribution """
        return self.action_head.entropy(action_pd_params)

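# --- Editorial usage sketch (not part of the original source) ---
# Illustrates threading the recurrent hidden state through PolicyGradientRnnModel.step()
# over a sequence of observations. `model`, `observation_sequence` and `initial_state`
# are assumed inputs; `initial_state` would typically be zeros of shape
# (batch_size, model.state_dim).
def _example_recurrent_rollout(model, observation_sequence, initial_state):
    """ Collect actions over a sequence, carrying the recurrent state between steps """
    state = initial_state
    actions = []

    for observations in observation_sequence:
        step_output = model.step(observations, state)
        actions.append(step_output['actions'])
        state = step_output['state']  # updated hidden state returned by the backbone

    return actions, state
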
class PolicyGradientModelSeparate(Model):
    """ For a policy gradient algorithm we need a set of custom heads for our model """

    def __init__(self, policy_backbone: LinearBackboneModel, value_backbone: LinearBackboneModel,
                 action_space: gym.Space):
        super().__init__()

        self.policy_backbone = policy_backbone
        self.value_backbone = value_backbone
        self.action_head = ActionHead(
            action_space=action_space,
            input_dim=self.policy_backbone.output_dim
        )
        self.value_head = ValueHead(input_dim=self.value_backbone.output_dim)

    def reset_weights(self):
        """ Properly initialize model weights """
        self.policy_backbone.reset_weights()
        self.value_backbone.reset_weights()
        self.action_head.reset_weights()
        self.value_head.reset_weights()

    def forward(self, observations):
        """ Calculate model outputs """
        policy_base_output = self.policy_backbone(observations)
        value_base_output = self.value_backbone(observations)

        action_output = self.action_head(policy_base_output)
        value_output = self.value_head(value_base_output)

        return action_output, value_output

    def step(self, observation, argmax_sampling=False):
        """ Select actions based on the model's output """
        policy_params, values = self(observation)
        actions = self.action_head.sample(policy_params, argmax_sampling=argmax_sampling)

        # log likelihood of selected action
        logprobs = self.action_head.logprob(actions, policy_params)

        return {'actions': actions, 'values': values, 'logprobs': logprobs}

    def policy_parameters(self):
        """ Parameters of the policy (policy backbone and action head) """
        return it.chain(self.policy_backbone.parameters(), self.action_head.parameters())

    def logprob(self, action_sample, policy_params):
        """ Calculate log-probability of selected actions """
        return self.action_head.logprob(action_sample, policy_params)

    def value(self, observation):
        """ Calculate only the value head for a given state """
        base_output = self.value_backbone(observation)
        value_output = self.value_head(base_output)
        return value_output

    def policy(self, observation):
        """ Calculate only the action head for a given state """
        policy_base_output = self.policy_backbone(observation)
        policy_params = self.action_head(policy_base_output)
        return policy_params

    def evaluate(self, rollout: Rollout) -> Evaluator:
        """ Evaluate model on a rollout """
        return PolicyGradientEvaluator(self, rollout)

    def entropy(self, policy_params):
        """ Entropy of a probability distribution """
        return self.action_head.entropy(policy_params)

    def kl_divergence(self, pd_q, pd_p):
        """ Calculate KL-divergence between two probability distributions """
        return self.action_head.kl_divergence(pd_q, pd_p)

class StochasticPolicyModelSeparate(RlModel):
    """ Policy gradient model class with actor and critic heads that don't share a backbone """

    def __init__(self, input_block: BackboneModel, policy_backbone: LinearBackboneModel,
                 value_backbone: LinearBackboneModel, action_space: gym.Space):
        super().__init__()

        self.input_block = input_block
        self.policy_backbone = policy_backbone
        self.value_backbone = value_backbone
        self.action_head = ActionHead(
            action_space=action_space,
            input_dim=self.policy_backbone.output_dim
        )
        self.value_head = ValueHead(input_dim=self.value_backbone.output_dim)

    def reset_weights(self):
        """ Properly initialize model weights """
        self.input_block.reset_weights()
        self.policy_backbone.reset_weights()
        self.value_backbone.reset_weights()
        self.action_head.reset_weights()
        self.value_head.reset_weights()

    def forward(self, observations):
        """ Calculate model outputs """
        input_data = self.input_block(observations)

        policy_base_output = self.policy_backbone(input_data)
        value_base_output = self.value_backbone(input_data)

        action_output = self.action_head(policy_base_output)
        value_output = self.value_head(value_base_output)

        return action_output, value_output

    def step(self, observation, argmax_sampling=False):
        """ Select actions based on the model's output """
        policy_params, values = self(observation)
        actions = self.action_head.sample(policy_params, argmax_sampling=argmax_sampling)

        # log likelihood of selected action
        logprobs = self.action_head.logprob(actions, policy_params)

        return {
            'actions': actions,
            'values': values,
            'action:logprobs': logprobs
        }

    def policy_parameters(self):
        """ Parameters of the policy (policy backbone and action head) """
        return it.chain(self.policy_backbone.parameters(), self.action_head.parameters())

    def logprob(self, action_sample, policy_params):
        """ Calculate log-probability of selected actions """
        return self.action_head.logprob(action_sample, policy_params)

    def value(self, observations):
        """ Calculate only the value head for a given state """
        input_data = self.input_block(observations)
        base_output = self.value_backbone(input_data)
        value_output = self.value_head(base_output)
        return value_output

    def policy(self, observations):
        """ Calculate only the action head for a given state """
        input_data = self.input_block(observations)
        policy_base_output = self.policy_backbone(input_data)
        policy_params = self.action_head(policy_base_output)
        return policy_params

    def evaluate(self, rollout: Rollout) -> Evaluator:
        """ Evaluate model on a rollout """
        return StochasticPolicyEvaluator(self, rollout)

    def entropy(self, policy_params):
        """ Entropy of a probability distribution """
        return self.action_head.entropy(policy_params)

    def kl_divergence(self, pd_q, pd_p):
        """ Calculate KL-divergence between two probability distributions """
        return self.action_head.kl_divergence(pd_q, pd_p)

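# --- Editorial usage sketch (not part of the original source) ---
# With separate policy and value backbones, the actor and critic can be optimized
# independently. This assumes the model inherits PyTorch's nn.Module.parameters();
# the optimizer choice and learning rates are illustrative, not prescribed by the library.
def _example_separate_optimizers(model):
    """ Build one optimizer for the policy parameters and one for the remaining (value) parameters """
    import torch

    policy_params = list(model.policy_parameters())
    policy_param_ids = {id(p) for p in policy_params}
    value_params = [p for p in model.parameters() if id(p) not in policy_param_ids]

    policy_optimizer = torch.optim.Adam(policy_params, lr=3e-4)
    value_optimizer = torch.optim.Adam(value_params, lr=1e-3)

    return policy_optimizer, value_optimizer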