def __init__(
    self,
    q_network,
    q_network_target,
    reward_network,
    parameters: ContinuousActionModelParameters,
) -> None:
    self.double_q_learning = parameters.rainbow.double_q_learning
    self.minibatch_size = parameters.training.minibatch_size

    RLTrainer.__init__(
        self,
        parameters,
        use_gpu=False,
        additional_feature_types=None,
        gradient_handler=None,
    )

    self.q_network = q_network
    self.q_network_target = q_network_target
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(),
        lr=parameters.training.learning_rate,
        weight_decay=parameters.training.l2_decay,
    )

    self.reward_network = reward_network
    self.reward_network_optimizer = self.optimizer_func(
        self.reward_network.parameters(), lr=parameters.training.learning_rate
    )
def __init__(
    self,
    parameters: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu=False,
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
) -> None:
    self.warm_start_model_path = parameters.training.warm_start_model_path
    self.minibatch_size = parameters.training.minibatch_size
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    self.num_state_features = get_num_output_features(
        state_normalization_parameters
    )
    self.num_action_features = get_num_output_features(
        action_normalization_parameters
    )
    self.num_features = self.num_state_features + self.num_action_features

    # ensure state and action IDs have no intersection
    overlapping_features = set(state_normalization_parameters.keys()) & set(
        action_normalization_parameters.keys()
    )
    assert len(overlapping_features) == 0, (
        "There are some overlapping state and action features: "
        + str(overlapping_features)
    )

    if parameters.training.factorization_parameters is None:
        parameters.training.layers[0] = self.num_features
        parameters.training.layers[-1] = 1
    else:
        parameters.training.factorization_parameters.state.layers[
            0
        ] = self.num_state_features
        parameters.training.factorization_parameters.action.layers[
            0
        ] = self.num_action_features

    RLTrainer.__init__(self, parameters, use_gpu, additional_feature_types, None)

    self.q_network = self._get_model(parameters.training)
    self.q_network_target = deepcopy(self.q_network)
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(), lr=parameters.training.learning_rate
    )

    self.reward_network = self._get_model(parameters.training)
    self.reward_network_optimizer = self.optimizer_func(
        self.reward_network.parameters(), lr=parameters.training.learning_rate
    )

    if self.use_gpu:
        self.q_network.cuda()
        self.q_network_target.cuda()
        self.reward_network.cuda()
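# Illustrative sketch (not part of the trainer above): the overlap assertion in
# the constructor simply intersects the state and action feature IDs; any
# shared ID is treated as a configuration error. The IDs and placeholder
# values below are made up for this example.
state_ids = {1: "norm_a", 2: "norm_b"}    # stand-ins for NormalizationParameters
action_ids = {3: "norm_c", 4: "norm_d"}
overlapping = set(state_ids.keys()) & set(action_ids.keys())
assert len(overlapping) == 0, (
    "There are some overlapping state and action features: " + str(overlapping)
)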
def __init__(
    self,
    parameters: DiscreteActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu=False,
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    gradient_handler=None,
) -> None:
    self.double_q_learning = parameters.rainbow.double_q_learning
    self.warm_start_model_path = parameters.training.warm_start_model_path
    self.minibatch_size = parameters.training.minibatch_size
    self._actions = parameters.actions if parameters.actions is not None else []

    self.reward_shape = {}  # type: Dict[int, float]
    if parameters.rl.reward_boost is not None and self._actions is not None:
        for k in parameters.rl.reward_boost.keys():
            i = self._actions.index(k)
            self.reward_shape[i] = parameters.rl.reward_boost[k]

    if parameters.training.cnn_parameters is None:
        self.state_normalization_parameters: Optional[
            Dict[int, NormalizationParameters]
        ] = state_normalization_parameters
        self.num_features = get_num_output_features(state_normalization_parameters)
        parameters.training.layers[0] = self.num_features
    else:
        self.state_normalization_parameters = None
    parameters.training.layers[-1] = self.num_actions

    RLTrainer.__init__(
        self, parameters, use_gpu, additional_feature_types, gradient_handler
    )

    if parameters.rainbow.dueling_architecture:
        self.q_network = DuelingArchitectureQNetwork(
            parameters.training.layers, parameters.training.activations
        )
    else:
        self.q_network = GenericFeedForwardNetwork(
            parameters.training.layers, parameters.training.activations
        )
    self.q_network_target = deepcopy(self.q_network)
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(), lr=parameters.training.learning_rate
    )

    self.reward_network = GenericFeedForwardNetwork(
        parameters.training.layers, parameters.training.activations
    )
    self.reward_network_optimizer = self.optimizer_func(
        self.reward_network.parameters(), lr=parameters.training.learning_rate
    )

    if self.use_gpu:
        self.q_network.cuda()
        self.q_network_target.cuda()
        self.reward_network.cuda()
def __init__(
    self, imitator, parameters: DiscreteActionModelParameters, use_gpu=False
) -> None:
    self._set_optimizer(parameters.training.optimizer)
    self.minibatch_size = parameters.training.minibatch_size
    self.imitator = imitator
    self.imitator_optimizer = self.optimizer_func(
        imitator.parameters(),
        lr=parameters.training.learning_rate,
        weight_decay=parameters.training.l2_decay,
    )
    RLTrainer.__init__(self, parameters, use_gpu=use_gpu)
def __init__(
    self,
    q_network,
    q_network_target,
    parameters: DiscreteActionModelParameters,
    use_gpu=False,
    metrics_to_score=None,
) -> None:
    RLTrainer.__init__(
        self,
        parameters,
        use_gpu=use_gpu,
        metrics_to_score=metrics_to_score,
        actions=parameters.actions,
    )

    self.double_q_learning = parameters.rainbow.double_q_learning
    self.minibatch_size = parameters.training.minibatch_size
    self.minibatches_per_step = parameters.training.minibatches_per_step or 1
    self._actions = parameters.actions if parameters.actions is not None else []
    self.q_network = q_network
    self.q_network_target = q_network_target
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(),
        lr=parameters.training.learning_rate,
        weight_decay=parameters.rainbow.c51_l2_decay,
    )
    self.qmin = parameters.rainbow.qmin
    self.qmax = parameters.rainbow.qmax
    self.num_atoms = parameters.rainbow.num_atoms
    self.support = torch.linspace(
        self.qmin, self.qmax, self.num_atoms, device=self.device
    )
    self.reward_boosts = torch.zeros([1, len(self._actions)], device=self.device)
    if parameters.rl.reward_boost is not None:
        for k in parameters.rl.reward_boost.keys():
            i = self._actions.index(k)
            self.reward_boosts[0, i] = parameters.rl.reward_boost[k]
def __init__(
    self,
    q_network,
    q_network_target,
    parameters: C51TrainerParameters,
    use_gpu=False,
    metrics_to_score=None,
) -> None:
    RLTrainer.__init__(
        self,
        parameters.rl,
        use_gpu=use_gpu,
        metrics_to_score=metrics_to_score,
        actions=parameters.actions,
    )

    self.double_q_learning = parameters.double_q_learning
    self.minibatch_size = parameters.minibatch_size
    self.minibatches_per_step = parameters.minibatches_per_step or 1
    self._actions = parameters.actions if parameters.actions is not None else []
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.q_network_optimizer = self._get_optimizer(q_network, parameters.optimizer)
    self.qmin = parameters.qmin
    self.qmax = parameters.qmax
    self.num_atoms = parameters.num_atoms
    self.support = torch.linspace(
        self.qmin, self.qmax, self.num_atoms, device=self.device
    )
    self.reward_boosts = torch.zeros([1, len(self._actions)], device=self.device)
    if parameters.rl.reward_boost is not None:
        for k in parameters.rl.reward_boost.keys():
            i = self._actions.index(k)
            self.reward_boosts[0, i] = parameters.rl.reward_boost[k]
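# Illustrative sketch (not part of the trainers above): in the C51 constructors
# the support is an evenly spaced grid of num_atoms return values between qmin
# and qmax, and reward_boosts is a 1 x num_actions tensor indexed by action
# position. The action names and parameter values below are made up for the
# example; only torch.linspace / torch.zeros behavior is being shown.
import torch

qmin, qmax, num_atoms = -10.0, 10.0, 51
support = torch.linspace(qmin, qmax, num_atoms)  # shape (51,), spacing 0.4,
                                                 # endpoints exactly qmin/qmax

actions = ["no_op", "click"]                     # hypothetical action names
reward_boost = {"click": 0.5}                    # hypothetical boost config
reward_boosts = torch.zeros([1, len(actions)])
for name, boost in reward_boost.items():
    reward_boosts[0, actions.index(name)] = boost  # -> tensor([[0.0, 0.5]])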
def __init__(
    self,
    parameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    min_action_range_tensor_serving: torch.Tensor,
    max_action_range_tensor_serving: torch.Tensor,
    use_gpu: bool = False,
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    use_all_avail_gpus: bool = False,
) -> None:
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    self.state_dim = get_num_output_features(state_normalization_parameters)
    self.action_dim = min_action_range_tensor_serving.shape[1]

    # The actor generates actions in [-1, 1] because of its tanh output layer,
    # so actions are converted to the range [-1, 1] before training.
    self.min_action_range_tensor_training = torch.ones(1, self.action_dim) * -1
    self.max_action_range_tensor_training = torch.ones(1, self.action_dim)
    self.min_action_range_tensor_serving = min_action_range_tensor_serving
    self.max_action_range_tensor_serving = max_action_range_tensor_serving

    # Shared params
    self.warm_start_model_path = parameters.shared_training.warm_start_model_path
    self.minibatch_size = parameters.shared_training.minibatch_size
    self.final_layer_init = parameters.shared_training.final_layer_init
    self._set_optimizer(parameters.shared_training.optimizer)

    # Actor params
    self.actor_params = parameters.actor_training
    assert (
        self.actor_params.activations[-1] == "tanh"
    ), "Actor final layer activation must be tanh"
    self.actor_params.layers[0] = self.state_dim
    self.actor_params.layers[-1] = self.action_dim
    self.noise_generator = OrnsteinUhlenbeckProcessNoise(self.action_dim)
    self.actor = ActorNet(
        self.actor_params.layers,
        self.actor_params.activations,
        self.final_layer_init,
    )
    self.actor_target = deepcopy(self.actor)
    self.actor_optimizer = self.optimizer_func(
        self.actor.parameters(),
        lr=self.actor_params.learning_rate,
        weight_decay=self.actor_params.l2_decay,
    )
    self.noise = self.noise_generator

    # Critic params
    self.critic_params = parameters.critic_training
    self.critic_params.layers[0] = self.state_dim
    self.critic_params.layers[-1] = 1
    self.critic = CriticNet(
        self.critic_params.layers,
        self.critic_params.activations,
        self.final_layer_init,
        self.action_dim,
    )
    self.critic_target = deepcopy(self.critic)
    self.critic_optimizer = self.optimizer_func(
        self.critic.parameters(),
        lr=self.critic_params.learning_rate,
        weight_decay=self.critic_params.l2_decay,
    )

    RLTrainer.__init__(self, parameters, use_gpu, additional_feature_types, None)

    self.min_action_range_tensor_training = self.min_action_range_tensor_training.type(
        self.dtype
    )
    self.max_action_range_tensor_training = self.max_action_range_tensor_training.type(
        self.dtype
    )
    self.min_action_range_tensor_serving = self.min_action_range_tensor_serving.type(
        self.dtype
    )
    self.max_action_range_tensor_serving = self.max_action_range_tensor_serving.type(
        self.dtype
    )

    if self.use_gpu:
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

        if use_all_avail_gpus:
            self.actor = nn.DataParallel(self.actor)
            self.actor_target = nn.DataParallel(self.actor_target)
            self.critic = nn.DataParallel(self.critic)
            self.critic_target = nn.DataParallel(self.critic_target)
def __init__(
    self,
    actor_network,
    critic_network,
    parameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    min_action_range_tensor_serving: torch.Tensor,
    max_action_range_tensor_serving: torch.Tensor,
    use_gpu: bool = False,
    use_all_avail_gpus: bool = False,
) -> None:
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    for param in self.action_normalization_parameters.values():
        assert param.feature_type == CONTINUOUS, (
            "DDPG Actor features must be set to continuous (set to "
            + param.feature_type
            + ")"
        )
    self.state_dim = get_num_output_features(state_normalization_parameters)
    self.action_dim = min_action_range_tensor_serving.shape[1]
    self.num_features = self.state_dim + self.action_dim

    # The actor generates actions in [-1, 1] because of its tanh output layer,
    # so actions are converted to the range [-1, 1] before training.
    self.min_action_range_tensor_training = torch.ones(1, self.action_dim) * -1
    self.max_action_range_tensor_training = torch.ones(1, self.action_dim)
    self.min_action_range_tensor_serving = min_action_range_tensor_serving
    self.max_action_range_tensor_serving = max_action_range_tensor_serving

    # Shared params
    self.warm_start_model_path = parameters.shared_training.warm_start_model_path
    self.minibatch_size = parameters.shared_training.minibatch_size
    self.final_layer_init = parameters.shared_training.final_layer_init
    self._set_optimizer(parameters.shared_training.optimizer)

    # Actor params
    self.actor_params = parameters.actor_training
    self.actor = actor_network
    self.actor_target = deepcopy(self.actor)
    self.actor_optimizer = self.optimizer_func(
        self.actor.parameters(),
        lr=self.actor_params.learning_rate,
        weight_decay=self.actor_params.l2_decay,
    )
    self.noise = OrnsteinUhlenbeckProcessNoise(self.action_dim)

    # Critic params
    self.critic_params = parameters.critic_training
    self.critic = self.q_network = critic_network
    self.critic_target = deepcopy(self.critic)
    self.critic_optimizer = self.optimizer_func(
        self.critic.parameters(),
        lr=self.critic_params.learning_rate,
        weight_decay=self.critic_params.l2_decay,
    )

    # ensure state and action IDs have no intersection
    overlapping_features = set(state_normalization_parameters.keys()) & set(
        action_normalization_parameters.keys()
    )
    assert len(overlapping_features) == 0, (
        "There are some overlapping state and action features: "
        + str(overlapping_features)
    )

    RLTrainer.__init__(self, parameters, use_gpu, None)

    self.min_action_range_tensor_training = self.min_action_range_tensor_training.type(
        self.dtype
    )
    self.max_action_range_tensor_training = self.max_action_range_tensor_training.type(
        self.dtype
    )
    self.min_action_range_tensor_serving = self.min_action_range_tensor_serving.type(
        self.dtype
    )
    self.max_action_range_tensor_serving = self.max_action_range_tensor_serving.type(
        self.dtype
    )
def __init__(
    self,
    parameters: DiscreteActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    metrics_to_score=None,
    gradient_handler=None,
    use_all_avail_gpus: bool = False,
) -> None:
    self.double_q_learning = parameters.rainbow.double_q_learning
    self.warm_start_model_path = parameters.training.warm_start_model_path
    self.minibatch_size = parameters.training.minibatch_size
    self._actions = parameters.actions if parameters.actions is not None else []

    if parameters.training.cnn_parameters is None:
        self.state_normalization_parameters: Optional[
            Dict[int, NormalizationParameters]
        ] = state_normalization_parameters
        self.num_features = get_num_output_features(state_normalization_parameters)
        logger.info("Number of state features: " + str(self.num_features))
        parameters.training.layers[0] = self.num_features
    else:
        self.state_normalization_parameters = None
    parameters.training.layers[-1] = self.num_actions

    RLTrainer.__init__(
        self,
        parameters,
        use_gpu,
        additional_feature_types,
        metrics_to_score,
        gradient_handler,
        actions=self._actions,
    )

    self.reward_boosts = torch.zeros([1, len(self._actions)]).type(self.dtype)
    if parameters.rl.reward_boost is not None:
        for k in parameters.rl.reward_boost.keys():
            i = self._actions.index(k)
            self.reward_boosts[0, i] = parameters.rl.reward_boost[k]

    if parameters.rainbow.dueling_architecture:
        self.q_network = DuelingQNetwork(
            parameters.training.layers,
            parameters.training.activations,
            use_batch_norm=parameters.training.use_batch_norm,
        )
    else:
        if parameters.training.cnn_parameters is None:
            self.q_network = FullyConnectedNetwork(
                parameters.training.layers,
                parameters.training.activations,
                use_noisy_linear_layers=parameters.training.use_noisy_linear_layers,
                min_std=parameters.training.weight_init_min_std,
                use_batch_norm=parameters.training.use_batch_norm,
            )
        else:
            self.q_network = ConvolutionalNetwork(
                parameters.training.cnn_parameters,
                parameters.training.layers,
                parameters.training.activations,
                use_noisy_linear_layers=parameters.training.use_noisy_linear_layers,
                min_std=parameters.training.weight_init_min_std,
                use_batch_norm=parameters.training.use_batch_norm,
            )

    self.q_network_target = deepcopy(self.q_network)
    self.q_network._name = "training"
    self.q_network_target._name = "target"
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(),
        lr=parameters.training.learning_rate,
        weight_decay=parameters.training.l2_decay,
    )

    self._init_cpe_networks(parameters, use_all_avail_gpus)

    if self.use_gpu:
        self.q_network.cuda()
        self.q_network_target.cuda()

        if use_all_avail_gpus:
            self.q_network = torch.nn.DataParallel(self.q_network)
            self.q_network_target = torch.nn.DataParallel(self.q_network_target)
def __init__(
    self,
    parameters: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    gradient_handler=None,
    use_all_avail_gpus: bool = False,
) -> None:
    self.double_q_learning = parameters.rainbow.double_q_learning
    self.warm_start_model_path = parameters.training.warm_start_model_path
    self.minibatch_size = parameters.training.minibatch_size
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    self.num_state_features = get_num_output_features(
        state_normalization_parameters
    )
    self.num_action_features = get_num_output_features(
        action_normalization_parameters
    )
    self.num_features = self.num_state_features + self.num_action_features

    # ensure state and action IDs have no intersection
    overlapping_features = set(state_normalization_parameters.keys()) & set(
        action_normalization_parameters.keys()
    )
    assert len(overlapping_features) == 0, (
        "There are some overlapping state and action features: "
        + str(overlapping_features)
    )

    reward_network_layers = deepcopy(parameters.training.layers)
    reward_network_layers[0] = self.num_features
    reward_network_layers[-1] = 1

    if parameters.rainbow.dueling_architecture:
        parameters.training.layers[0] = self.num_state_features
        parameters.training.layers[-1] = 1
    elif parameters.training.factorization_parameters is None:
        parameters.training.layers[0] = self.num_features
        parameters.training.layers[-1] = 1
    else:
        parameters.training.factorization_parameters.state.layers[
            0
        ] = self.num_state_features
        parameters.training.factorization_parameters.action.layers[
            0
        ] = self.num_action_features

    RLTrainer.__init__(
        self, parameters, use_gpu, additional_feature_types, gradient_handler
    )

    self.q_network = self._get_model(
        parameters.training, parameters.rainbow.dueling_architecture
    )
    self.q_network_target = deepcopy(self.q_network)
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(),
        lr=parameters.training.learning_rate,
        weight_decay=parameters.training.l2_decay,
    )

    self.reward_network = FullyConnectedNetwork(
        reward_network_layers, parameters.training.activations
    )
    self.reward_network_optimizer = self.optimizer_func(
        self.reward_network.parameters(), lr=parameters.training.learning_rate
    )

    if self.use_gpu:
        self.q_network.cuda()
        self.q_network_target.cuda()
        self.reward_network.cuda()

        if use_all_avail_gpus:
            self.q_network = torch.nn.DataParallel(self.q_network)
            self.q_network_target = torch.nn.DataParallel(self.q_network_target)
            self.reward_network = torch.nn.DataParallel(self.reward_network)
def __init__(
    self,
    parameters: ContinuousActionModelParameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    use_gpu: bool = False,
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    metrics_to_score=None,
    gradient_handler=None,
    use_all_avail_gpus: bool = False,
) -> None:
    self.double_q_learning = parameters.rainbow.double_q_learning
    self.warm_start_model_path = parameters.training.warm_start_model_path
    self.minibatch_size = parameters.training.minibatch_size
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    self.num_state_features = get_num_output_features(
        state_normalization_parameters
    )
    self.num_action_features = get_num_output_features(
        action_normalization_parameters
    )
    self.num_features = self.num_state_features + self.num_action_features

    # ensure state and action IDs have no intersection
    overlapping_features = set(state_normalization_parameters.keys()) & set(
        action_normalization_parameters.keys()
    )
    assert len(overlapping_features) == 0, (
        "There are some overlapping state and action features: "
        + str(overlapping_features)
    )

    reward_network_layers = deepcopy(parameters.training.layers)
    reward_network_layers[0] = self.num_features
    reward_network_layers[-1] = 1

    if parameters.rainbow.dueling_architecture:
        parameters.training.layers[0] = self.num_state_features
        parameters.training.layers[-1] = 1
    elif parameters.training.factorization_parameters is None:
        parameters.training.layers[0] = self.num_features
        parameters.training.layers[-1] = 1
    else:
        parameters.training.factorization_parameters.state.layers[
            0
        ] = self.num_state_features
        parameters.training.factorization_parameters.action.layers[
            0
        ] = self.num_action_features

    RLTrainer.__init__(
        self,
        parameters,
        use_gpu,
        additional_feature_types,
        metrics_to_score,
        gradient_handler,
    )

    self.q_network = self._get_model(
        parameters.training, parameters.rainbow.dueling_architecture
    )
    self.q_network_target = deepcopy(self.q_network)
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(),
        lr=parameters.training.learning_rate,
        weight_decay=parameters.training.l2_decay,
    )

    self.reward_network = FullyConnectedNetwork(
        reward_network_layers, parameters.training.activations
    )
    self.reward_network_optimizer = self.optimizer_func(
        self.reward_network.parameters(), lr=parameters.training.learning_rate
    )

    if self.use_gpu:
        self.q_network.cuda()
        self.q_network_target.cuda()
        self.reward_network.cuda()

        if use_all_avail_gpus:
            self.q_network = torch.nn.DataParallel(self.q_network)
            self.q_network_target = torch.nn.DataParallel(self.q_network_target)
            self.reward_network = torch.nn.DataParallel(self.reward_network)
def __init__(
    self,
    parameters,
    state_normalization_parameters: Dict[int, NormalizationParameters],
    action_normalization_parameters: Dict[int, NormalizationParameters],
    min_action_range_tensor_serving: torch.Tensor,
    max_action_range_tensor_serving: torch.Tensor,
    use_gpu: bool = False,
    additional_feature_types: AdditionalFeatureTypes = DEFAULT_ADDITIONAL_FEATURE_TYPES,
    use_all_avail_gpus: bool = False,
) -> None:
    self.state_normalization_parameters = state_normalization_parameters
    self.action_normalization_parameters = action_normalization_parameters
    for param in self.action_normalization_parameters.values():
        assert param.feature_type == CONTINUOUS, (
            "DDPG Actor features must be set to continuous (set to "
            + param.feature_type
            + ")"
        )
    self.state_dim = get_num_output_features(state_normalization_parameters)
    self.action_dim = min_action_range_tensor_serving.shape[1]
    self.num_features = self.state_dim + self.action_dim

    # The actor generates actions in [-1, 1] because of its tanh output layer,
    # so actions are converted to the range [-1, 1] before training.
    self.min_action_range_tensor_training = torch.ones(1, self.action_dim) * -1
    self.max_action_range_tensor_training = torch.ones(1, self.action_dim)
    self.min_action_range_tensor_serving = min_action_range_tensor_serving
    self.max_action_range_tensor_serving = max_action_range_tensor_serving

    # Shared params
    self.warm_start_model_path = parameters.shared_training.warm_start_model_path
    self.minibatch_size = parameters.shared_training.minibatch_size
    self.final_layer_init = parameters.shared_training.final_layer_init
    self._set_optimizer(parameters.shared_training.optimizer)

    # Actor params
    self.actor_params = parameters.actor_training
    assert (
        self.actor_params.activations[-1] == "tanh"
    ), "Actor final layer activation must be tanh"
    self.actor_params.layers[0] = self.state_dim
    self.actor_params.layers[-1] = self.action_dim
    self.noise_generator = OrnsteinUhlenbeckProcessNoise(self.action_dim)
    self.actor = ActorNet(
        self.actor_params.layers,
        self.actor_params.activations,
        self.final_layer_init,
    )
    self.actor_target = deepcopy(self.actor)
    self.actor_optimizer = self.optimizer_func(
        self.actor.parameters(),
        lr=self.actor_params.learning_rate,
        weight_decay=self.actor_params.l2_decay,
    )
    self.noise = self.noise_generator

    # Critic params
    self.critic_params = parameters.critic_training
    self.critic_params.layers[0] = self.state_dim
    self.critic_params.layers[-1] = 1
    self.critic = self.q_network = CriticNet(
        self.critic_params.layers,
        self.critic_params.activations,
        self.final_layer_init,
        self.action_dim,
    )
    self.critic_target = deepcopy(self.critic)
    self.critic_optimizer = self.optimizer_func(
        self.critic.parameters(),
        lr=self.critic_params.learning_rate,
        weight_decay=self.critic_params.l2_decay,
    )

    # ensure state and action IDs have no intersection
    overlapping_features = set(state_normalization_parameters.keys()) & set(
        action_normalization_parameters.keys()
    )
    assert len(overlapping_features) == 0, (
        "There are some overlapping state and action features: "
        + str(overlapping_features)
    )

    RLTrainer.__init__(self, parameters, use_gpu, additional_feature_types, None)

    self.min_action_range_tensor_training = self.min_action_range_tensor_training.type(
        self.dtype
    )
    self.max_action_range_tensor_training = self.max_action_range_tensor_training.type(
        self.dtype
    )
    self.min_action_range_tensor_serving = self.min_action_range_tensor_serving.type(
        self.dtype
    )
    self.max_action_range_tensor_serving = self.max_action_range_tensor_serving.type(
        self.dtype
    )

    if self.use_gpu:
        self.actor.cuda()
        self.actor_target.cuda()
        self.critic.cuda()
        self.critic_target.cuda()

        if use_all_avail_gpus:
            self.actor = nn.DataParallel(self.actor)
            self.actor_target = nn.DataParallel(self.actor_target)
            self.critic = nn.DataParallel(self.critic)
            self.critic_target = nn.DataParallel(self.critic_target)
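# Illustrative sketch (not part of the trainers above): the four range tensors
# the DDPG constructors keep (min/max for serving and for training) describe a
# linear rescaling between the serving action range and the tanh training
# range [-1, 1]. rescale_actions is a hypothetical helper written here only to
# show that mapping; the example ranges are made up.
import torch

def rescale_actions(actions, min_from, max_from, min_to, max_to):
    # Map each action component linearly from [min_from, max_from] to
    # [min_to, max_to], broadcasting the 1 x action_dim range tensors.
    return (actions - min_from) / (max_from - min_from) * (max_to - min_to) + min_to

action_dim = 2
min_serving = torch.tensor([[0.0, -5.0]])   # hypothetical serving range
max_serving = torch.tensor([[10.0, 5.0]])
min_training = -torch.ones(1, action_dim)   # tanh output range
max_training = torch.ones(1, action_dim)

served = torch.tensor([[10.0, 0.0]])
in_training_range = rescale_actions(
    served, min_serving, max_serving, min_training, max_training
)
# -> tensor([[1., 0.]]): the maximum of the first component maps to 1 and the
#    midpoint of the second component maps to 0.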