def __init__(
    self,
    observation_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    lr_schedule: LearningRateSchedule,
    net_arch: Optional[List[int]] = None,
    activation_fn: Type[nn.Module] = nn.ReLU,
    features_extractor_class: Type[BaseFeaturesExtractor] = NatureCNN,
    features_extractor_kwargs: Optional[Dict[str, Any]] = None,
    normalize_images: bool = True,
    optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
    optimizer_kwargs: Optional[Dict[str, Any]] = None,
):
    super(CnnPolicy, self).__init__(
        observation_space,
        action_space,
        lr_schedule,
        net_arch,
        activation_fn,
        features_extractor_class,
        features_extractor_kwargs,
        normalize_images,
        optimizer_class,
        optimizer_kwargs,
    )


register_policy("MlpPolicy", MlpPolicy)
register_policy("CnnPolicy", CnnPolicy)
        lr_schedule: Schedule,
        net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None,
        activation_fn: Type[nn.Module] = nn.ReLU,
        features_extractor_class: Type[BaseFeaturesExtractor] = CombinedExtractor,
        features_extractor_kwargs: Optional[Dict[str, Any]] = None,
        normalize_images: bool = True,
        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
        n_critics: int = 2,
        share_features_extractor: bool = True,
    ):
        super(MultiInputPolicy, self).__init__(
            observation_space,
            action_space,
            lr_schedule,
            net_arch,
            activation_fn,
            features_extractor_class,
            features_extractor_kwargs,
            normalize_images,
            optimizer_class,
            optimizer_kwargs,
            n_critics,
            share_features_extractor,
        )


register_policy("MlpPolicy", MlpPolicy)
register_policy("CnnPolicy", CnnPolicy)
register_policy("MultiInputPolicy", MultiInputPolicy)
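# A minimal lookup sketch, not from the snippet above: on SB3 versions that
# still ship the policy registry (< 1.6), a registered name can be resolved
# back to its class with get_policy_from_name. SACPolicy is an assumption here;
# the snippet could equally come from the TD3 policies module.
from stable_baselines3.common.policies import get_policy_from_name
from stable_baselines3.sac.policies import SACPolicy

# Importing stable_baselines3.sac.policies runs its register_policy calls,
# so the name is already in the registry at this point.
policy_class = get_policy_from_name(SACPolicy, "MultiInputPolicy")
print(policy_class)  # -> <class 'stable_baselines3.sac.policies.MultiInputPolicy'>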
import copy
import os
import timeit
from multiprocessing import Manager, Process, Queue

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, _check_length_scale

from MetaBayesOpt.AquisitionFunctions import MLPAF
from stable_baselines3.common.policies import register_policy

MMetric = None
__ACQUISITION__ = 'PI'

register_policy('MLPAF', MLPAF)

mem_bytes = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
maximum_search_points = 2**20  # 2**int(log2(mem_bytes/512 - 1))
__measure_time__ = False
__USE_CPP_BACKEND__ = None
__REGRESSOR_LIB__ = "SKLearn"

if __REGRESSOR_LIB__ == "GPY":
    try:
        import GPy
    except ImportError:
        print("importing GPY failed...")
        __REGRESSOR_LIB__ = "SKLEARN"
    # Make batch out of tensor (consisting of n-stacked octrees)
    octree_batch = preprocess_stacked_octree_batch(observation, self.device)
    with th.no_grad():
        actions = self._predict(octree_batch, deterministic=deterministic)
    # Convert to numpy
    actions = actions.cpu().numpy()

    if isinstance(self.action_space, gym.spaces.Box):
        if self.squash_output:
            # Rescale to proper domain when using squashing
            actions = self.unscale_action(actions)
        else:
            # Actions could be on arbitrary scale, so clip the actions to avoid
            # out of bound error (e.g. if sampling from a Gaussian distribution)
            actions = np.clip(actions, self.action_space.low, self.action_space.high)

    if not vectorized_env:
        if state is not None:
            raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
        actions = actions[0]

    return actions, state


register_policy("OctreeCnnPolicy", OctreeCnnPolicy)
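# For reference, a sketch of the rescaling that unscale_action performs. This
# mirrors BasePolicy.unscale_action in SB3, but the standalone function name
# and signature here are illustrative: an affine map from the squashed range
# [-1, 1] back to the Box bounds.
import numpy as np


def unscale_action_sketch(scaled_action: np.ndarray, low: np.ndarray, high: np.ndarray) -> np.ndarray:
    # [-1, 1] -> [low, high]
    return low + 0.5 * (scaled_action + 1.0) * (high - low)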
# This file is here just to define MlpPolicy/CnnPolicy
# that work for A2C
from stable_baselines3.common.policies import (
    ActorCriticCnnPolicy,
    ActorCriticPolicy,
    MultiInputActorCriticPolicy,
    register_policy,
)

MlpPolicy = ActorCriticPolicy
CnnPolicy = ActorCriticCnnPolicy
MultiInputPolicy = MultiInputActorCriticPolicy

register_policy("MlpPolicy", ActorCriticPolicy)
register_policy("CnnPolicy", ActorCriticCnnPolicy)
register_policy("MultiInputPolicy", MultiInputPolicy)
# This file is here just to define MlpPolicy/CnnPolicy
# that work for A2C
from stable_baselines3.common.policies import (
    ActorCriticCnnPolicy,
    ActorCriticPolicy,
    register_policy,
)

MlpPolicy = ActorCriticPolicy
CnnPolicy = ActorCriticCnnPolicy

register_policy("MlpPolicy", ActorCriticPolicy)
register_policy("CnnPolicy", ActorCriticCnnPolicy)
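# Usage sketch, not part of the file above: registering under a string name is
# what lets the algorithm constructor resolve the policy class from "MlpPolicy".
# The environment and timestep budget are illustrative.
from stable_baselines3 import A2C

model = A2C("MlpPolicy", "CartPole-v1", verbose=0)
model.learn(total_timesteps=1_000)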
        normalize_images: bool = True,
        optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam,
        optimizer_kwargs: Optional[Dict[str, Any]] = None,
    ):
        super(DQNPolicyAverageRewardAdjusted, self).__init__(
            observation_space,
            action_space,
            lr_schedule,
            net_arch,
            device,
            activation_fn,
            features_extractor_class,
            features_extractor_kwargs,
            normalize_images,
            optimizer_class,
            optimizer_kwargs,
        )

    def make_q_net(self) -> QNetworkAverageRewardAdjusted:
        # Make sure we always have separate networks for feature extractors etc
        features_extractor = self.features_extractor_class(self.observation_space, **self.features_extractor_kwargs)
        features_dim = features_extractor.features_dim
        return QNetworkAverageRewardAdjusted(
            features_extractor=features_extractor,
            features_dim=features_dim,
            **self.net_args,
        ).to(self.device)

    def _predict(self, obs: th.Tensor, deterministic: bool = True) -> th.Tensor:
        action, q_values = self.q_net._predict(obs, deterministic=deterministic)
        return action


register_policy("MlpAverageRewardAdjustedPolicy", DQNPolicyAverageRewardAdjusted)
import gym
import torch as th
from torch import nn

from stable_baselines3.common.policies import BasePolicy, register_policy
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor, FlattenExtractor, NatureCNN, create_mlp
from stable_baselines3.dqn.policies import DQNPolicy, QNetwork


class SoftQNetwork(QNetwork):
    def _predict(self, observation: th.Tensor, deterministic: bool = True) -> th.Tensor:
        q_values = self.forward(observation)
        # Scale Q-values (inverse temperature 10) before the softmax to sharpen
        # the distribution, then sample an action from it
        probs = nn.functional.softmax(q_values * 10, dim=1)
        m = th.distributions.Categorical(probs)
        action = m.sample().reshape(-1)
        return action


class SQLPolicy(DQNPolicy):
    def make_q_net(self) -> SoftQNetwork:
        # Make sure we always have separate networks for features extractors etc
        net_args = self._update_features_extractor(self.net_args, features_extractor=None)
        return SoftQNetwork(**net_args).to(self.device)


SoftMlpPolicy = SQLPolicy

register_policy("SoftMlpPolicy", SoftMlpPolicy)
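# Usage sketch, assuming the module above has been imported (so the name is
# registered) and an SB3 version before 1.6, where register_policy was removed:
# DQN can then build the soft policy straight from the string name.
from stable_baselines3 import DQN

model = DQN("SoftMlpPolicy", "CartPole-v1")
model.learn(total_timesteps=1_000)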