def __init__(
    self,
    model_params: DiscreteActionModelParameters,
    state_normalization: Dict[int, NormalizationParameters],
    use_gpu: bool,
    use_all_avail_gpus: bool,
):
    logger.info("Running DQN workflow with params:")
    logger.info(model_params)
    self.model_params = model_params

    trainer = create_dqn_trainer_from_params(
        model_params,
        state_normalization,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
    )
    trainer = update_model_for_warm_start(trainer)
    assert type(trainer) == DQNTrainer, "Warm started wrong model type: " + str(
        type(trainer)
    )

    evaluator = Evaluator(
        model_params.actions,
        model_params.rl.gamma,
        trainer,
        metrics_to_score=trainer.metrics_to_score,
    )

    super().__init__(
        DiscreteDqnBatchPreprocessor(Preprocessor(state_normalization, use_gpu)),
        trainer,
        evaluator,
        model_params.training.minibatch_size,
    )
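# A minimal instantiation sketch for the workflow constructor above. The enclosing
# class name DqnWorkflow is an assumption for illustration (the snippet does not show
# it), and model_params / state_normalization are assumed to be already built from the
# training config; the argument names and types are exactly those of the constructor.
workflow = DqnWorkflow(
    model_params=model_params,                # DiscreteActionModelParameters
    state_normalization=state_normalization,  # Dict[int, NormalizationParameters]
    use_gpu=False,
    use_all_avail_gpus=False,
)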
def test_trainer_maxq(self):
    env = Env(self.state_dims, self.action_dims)
    env.seed(42)
    maxq_parameters = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(gamma=0.95, target_update_rate=0.9, maxq_learning=True),
        rainbow=RainbowDQNParameters(
            double_q_learning=True, dueling_architecture=False
        ),
        training=TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=self.minibatch_size,
            learning_rate=0.25,
            optimizer="ADAM",
        ),
    )
    maxq_trainer = create_dqn_trainer_from_params(maxq_parameters, env.normalization)

    logger.info("Generating constant_reward MDPs..")
    (
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_actions,
        possible_next_actions,
    ) = env.generate_samples_discrete(self.num_samples)

    logger.info("Preprocessing constant_reward MDPs..")
    for epoch in range(self.epochs):
        tdps = env.preprocess_samples_discrete(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_actions,
            possible_next_actions,
            self.minibatch_size,
        )
        logger.info("Training.. " + str(epoch))
        for tdp in tdps:
            maxq_trainer.train(tdp)
        logger.info(
            " ".join(
                [
                    "Training epoch",
                    str(epoch),
                    "average q values",
                    str(torch.mean(maxq_trainer.all_action_scores)),
                ]
            )
        )

    # Q value should converge to very close to 20
    avg_q_value_after_training = torch.mean(maxq_trainer.all_action_scores)
    self.assertLess(avg_q_value_after_training, 22)
    self.assertGreater(avg_q_value_after_training, 18)
def _train(self, model_params, env):
    np.random.seed(0)
    random.seed(0)
    torch.manual_seed(0)
    env.seed(42)
    trainer = create_dqn_trainer_from_params(model_params, env.normalization)

    logger.info("Generating constant_reward MDPs..")
    (
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_actions,
        possible_next_actions,
    ) = env.generate_samples_discrete(self.num_samples)

    logger.info("Preprocessing constant_reward MDPs..")
    for epoch in range(self.epochs):
        tdps = env.preprocess_samples_discrete(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            is_terminal,
            possible_actions,
            possible_next_actions,
            model_params.training.minibatch_size,
        )
        logger.info("Training.. " + str(epoch))
        for tdp in tdps:
            trainer.train(tdp)
        logger.info(
            " ".join(
                [
                    "Training epoch",
                    str(epoch),
                    "average q values",
                    str(torch.mean(trainer.all_action_scores)),
                ]
            )
        )
    return trainer
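# A sketch of how _train could back a test, mirroring test_trainer_maxq above.
# The environment setup (Env, self.state_dims, self.action_dims), the parameter
# values, and the expected Q-value range are all taken from that test; the method
# name below is illustrative and not part of the original suite.
def test_trainer_maxq_via_helper(self):
    env = Env(self.state_dims, self.action_dims)
    maxq_parameters = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(gamma=0.95, target_update_rate=0.9, maxq_learning=True),
        rainbow=RainbowDQNParameters(
            double_q_learning=True, dueling_architecture=False
        ),
        training=TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=self.minibatch_size,
            learning_rate=0.25,
            optimizer="ADAM",
        ),
    )
    trainer = self._train(maxq_parameters, env)
    # On the constant_reward MDPs the average Q value should land close to 20.
    avg_q_value_after_training = torch.mean(trainer.all_action_scores)
    self.assertLess(avg_q_value_after_training, 22)
    self.assertGreater(avg_q_value_after_training, 18)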
def create_trainer(params: OpenAiGymParameters, env: OpenAIGymEnvironment):
    use_gpu = params.use_gpu
    model_type = params.model_type
    assert params.rl is not None
    rl_parameters = params.rl

    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        assert params.training is not None
        training_parameters = params.training
        assert params.rainbow is not None
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            # _replace returns a new namedtuple; assign it back so the change sticks
            training_parameters = training_parameters._replace(
                cnn_parameters=training_parameters.cnn_parameters._replace(
                    input_height=env.height,
                    input_width=env.width,
                    num_input_channels=env.num_input_channels,
                )
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        discrete_trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=params.rainbow,
            evaluation=params.evaluation,
        )
        trainer = create_dqn_trainer_from_params(
            discrete_trainer_params, env.normalization, use_gpu
        )
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        assert params.training is not None
        training_parameters = params.training
        assert params.rainbow is not None
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        continuous_trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters, training=training_parameters, rainbow=params.rainbow
        )
        trainer = create_parametric_dqn_trainer_from_params(
            continuous_trainer_params,
            env.normalization,
            env.normalization_action,
            use_gpu,
        )
    elif model_type == ModelType.TD3.value:
        assert params.td3_training is not None
        assert params.critic_training is not None
        assert params.actor_training is not None
        td3_trainer_params = TD3ModelParameters(
            rl=rl_parameters,
            training=params.td3_training,
            q_network=params.critic_training,
            actor_network=params.actor_training,
        )
        trainer = get_td3_trainer(env, td3_trainer_params, use_gpu)
    elif model_type == ModelType.SOFT_ACTOR_CRITIC.value:
        assert params.sac_training is not None
        assert params.critic_training is not None
        assert params.actor_training is not None
        trainer = get_sac_trainer(
            env,
            rl_parameters,
            params.sac_training,
            params.critic_training,
            params.actor_training,
            params.sac_value_training,
            use_gpu,
        )
    elif model_type == ModelType.CEM.value:
        assert params.cem is not None
        cem_trainer_params = params.cem._replace(rl=params.rl)
        trainer = get_cem_trainer(env, cem_trainer_params, use_gpu)
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return trainer
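# A hedged sketch of driving the typed create_trainer above for a discrete-DQN Gym
# run. Only the field names the function reads (use_gpu, model_type, rl, training,
# rainbow, evaluation) come from the code; the keyword construction of
# OpenAiGymParameters, the env="CartPole-v0" field, the EvaluationParameters()
# default, and the concrete layer/learning-rate values are illustrative assumptions,
# and `env` is assumed to be an already-constructed OpenAIGymEnvironment.
params = OpenAiGymParameters(
    env="CartPole-v0",  # assumed Gym environment id field
    model_type=ModelType.PYTORCH_DISCRETE_DQN.value,
    use_gpu=False,
    rl=RLParameters(gamma=0.99, target_update_rate=0.2, maxq_learning=True),
    training=TrainingParameters(
        layers=[-1, 128, 64, -1],
        activations=["relu", "relu", "linear"],
        minibatch_size=64,
        learning_rate=0.001,
        optimizer="ADAM",
    ),
    rainbow=RainbowDQNParameters(double_q_learning=True, dueling_architecture=False),
    evaluation=EvaluationParameters(),
)
trainer = create_trainer(params, env)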
def create_trainer(model_type, params, rl_parameters, use_gpu, env):
    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            if isinstance(training_parameters.cnn_parameters, dict):
                training_parameters.cnn_parameters = CNNParameters(
                    **training_parameters.cnn_parameters
                )
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = create_dqn_trainer_from_params(
            trainer_params, env.normalization, use_gpu
        )
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters, training=training_parameters, rainbow=rainbow_parameters
        )
        trainer = create_parametric_dqn_trainer_from_params(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )
    elif model_type == ModelType.TD3.value:
        trainer_params = TD3ModelParameters(
            rl=rl_parameters,
            training=TD3TrainingParameters(
                minibatch_size=params["td3_training"]["minibatch_size"],
                q_network_optimizer=OptimizerParameters(
                    **params["td3_training"]["q_network_optimizer"]
                ),
                actor_network_optimizer=OptimizerParameters(
                    **params["td3_training"]["actor_network_optimizer"]
                ),
                use_2_q_functions=params["td3_training"]["use_2_q_functions"],
                exploration_noise=params["td3_training"]["exploration_noise"],
                initial_exploration_ts=params["td3_training"]["initial_exploration_ts"],
                target_policy_smoothing=params["td3_training"][
                    "target_policy_smoothing"
                ],
                noise_clip=params["td3_training"]["noise_clip"],
                delayed_policy_update=params["td3_training"]["delayed_policy_update"],
            ),
            q_network=FeedForwardParameters(**params["td3_q_training"]),
            actor_network=FeedForwardParameters(**params["td3_actor_training"]),
        )
        trainer = get_td3_trainer(env, trainer_params, use_gpu)
    elif model_type == ModelType.SOFT_ACTOR_CRITIC.value:
        value_network = None
        value_network_optimizer = None
        alpha_optimizer = None
        if params["sac_training"]["use_value_network"]:
            value_network = FeedForwardParameters(**params["sac_value_training"])
            value_network_optimizer = OptimizerParameters(
                **params["sac_training"]["value_network_optimizer"]
            )
        if "alpha_optimizer" in params["sac_training"]:
            alpha_optimizer = OptimizerParameters(
                **params["sac_training"]["alpha_optimizer"]
            )
        entropy_temperature = params["sac_training"].get("entropy_temperature", None)
        target_entropy = params["sac_training"].get("target_entropy", None)
        trainer_params = SACModelParameters(
            rl=rl_parameters,
            training=SACTrainingParameters(
                minibatch_size=params["sac_training"]["minibatch_size"],
                use_2_q_functions=params["sac_training"]["use_2_q_functions"],
                use_value_network=params["sac_training"]["use_value_network"],
                q_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["q_network_optimizer"]
                ),
                value_network_optimizer=value_network_optimizer,
                actor_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["actor_network_optimizer"]
                ),
                entropy_temperature=entropy_temperature,
                target_entropy=target_entropy,
                alpha_optimizer=alpha_optimizer,
            ),
            q_network=FeedForwardParameters(**params["sac_q_training"]),
            value_network=value_network,
            actor_network=FeedForwardParameters(**params["sac_actor_training"]),
        )
        trainer = get_sac_trainer(env, trainer_params, use_gpu)
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return trainer
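# A hedged sketch of the `params` dict this create_trainer expects for
# ModelType.SOFT_ACTOR_CRITIC. The top-level keys and the nested keys it reads
# ("sac_training", "sac_q_training", "sac_value_training", "sac_actor_training",
# and the optional "alpha_optimizer" / "target_entropy") come from the function
# above; the concrete optimizer/layer values and the OptimizerParameters /
# FeedForwardParameters field names are illustrative assumptions, and
# rl_parameters / env are assumed to be already constructed.
sac_params = {
    "sac_training": {
        "minibatch_size": 256,
        "use_2_q_functions": True,
        "use_value_network": True,
        "q_network_optimizer": {"optimizer": "ADAM", "learning_rate": 0.001},
        "value_network_optimizer": {"optimizer": "ADAM", "learning_rate": 0.001},
        "actor_network_optimizer": {"optimizer": "ADAM", "learning_rate": 0.001},
        "entropy_temperature": 0.1,
        # "alpha_optimizer" and "target_entropy" may be added; the branch above
        # treats them as optional.
    },
    "sac_q_training": {"layers": [-1, 256, 128, -1], "activations": ["relu", "relu", "linear"]},
    "sac_value_training": {"layers": [-1, 256, 128, -1], "activations": ["relu", "relu", "linear"]},
    "sac_actor_training": {"layers": [-1, 256, 128, -1], "activations": ["relu", "relu", "linear"]},
}
trainer = create_trainer(
    ModelType.SOFT_ACTOR_CRITIC.value, sac_params, rl_parameters, use_gpu=False, env=env
)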
def create_trainer(model_type, params, rl_parameters, use_gpu, env):
    if model_type == ModelType.PYTORCH_DISCRETE_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            if isinstance(training_parameters.cnn_parameters, dict):
                training_parameters.cnn_parameters = CNNParameters(
                    **training_parameters.cnn_parameters
                )
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
            training_parameters.cnn_parameters.input_height = env.height
            training_parameters.cnn_parameters.input_width = env.width
            training_parameters.cnn_parameters.num_input_channels = (
                env.num_input_channels
            )
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = DiscreteActionModelParameters(
            actions=env.actions,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=rainbow_parameters,
        )
        trainer = create_dqn_trainer_from_params(
            trainer_params, env.normalization, use_gpu
        )
    elif model_type == ModelType.PYTORCH_PARAMETRIC_DQN.value:
        training_parameters = params["training"]
        if isinstance(training_parameters, dict):
            training_parameters = TrainingParameters(**training_parameters)
        rainbow_parameters = params["rainbow"]
        if isinstance(rainbow_parameters, dict):
            rainbow_parameters = RainbowDQNParameters(**rainbow_parameters)
        if env.img:
            assert (
                training_parameters.cnn_parameters is not None
            ), "Missing CNN parameters for image input"
            training_parameters.cnn_parameters.conv_dims[0] = env.num_input_channels
        else:
            assert (
                training_parameters.cnn_parameters is None
            ), "Extra CNN parameters for non-image input"
        trainer_params = ContinuousActionModelParameters(
            rl=rl_parameters, training=training_parameters, rainbow=rainbow_parameters
        )
        trainer = create_parametric_dqn_trainer_from_params(
            trainer_params, env.normalization, env.normalization_action, use_gpu
        )
    elif model_type == ModelType.CONTINUOUS_ACTION.value:
        training_parameters = params["shared_training"]
        if isinstance(training_parameters, dict):
            training_parameters = DDPGTrainingParameters(**training_parameters)
        actor_parameters = params["actor_training"]
        if isinstance(actor_parameters, dict):
            actor_parameters = DDPGNetworkParameters(**actor_parameters)
        critic_parameters = params["critic_training"]
        if isinstance(critic_parameters, dict):
            critic_parameters = DDPGNetworkParameters(**critic_parameters)
        trainer_params = DDPGModelParameters(
            rl=rl_parameters,
            shared_training=training_parameters,
            actor_training=actor_parameters,
            critic_training=critic_parameters,
        )
        action_range_low = env.action_space.low.astype(np.float32)
        action_range_high = env.action_space.high.astype(np.float32)
        trainer = DDPGTrainer(
            trainer_params,
            env.normalization,
            env.normalization_action,
            torch.from_numpy(action_range_low).unsqueeze(dim=0),
            torch.from_numpy(action_range_high).unsqueeze(dim=0),
            use_gpu,
        )
    elif model_type == ModelType.SOFT_ACTOR_CRITIC.value:
        trainer_params = SACModelParameters(
            rl=rl_parameters,
            training=SACTrainingParameters(
                minibatch_size=params["sac_training"]["minibatch_size"],
                use_2_q_functions=params["sac_training"]["use_2_q_functions"],
                q_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["q_network_optimizer"]
                ),
                value_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["value_network_optimizer"]
                ),
                actor_network_optimizer=OptimizerParameters(
                    **params["sac_training"]["actor_network_optimizer"]
                ),
                entropy_temperature=params["sac_training"]["entropy_temperature"],
            ),
            q_network=FeedForwardParameters(**params["sac_q_training"]),
            value_network=FeedForwardParameters(**params["sac_value_training"]),
            actor_network=FeedForwardParameters(**params["sac_actor_training"]),
        )
        trainer = get_sac_trainer(env, trainer_params, use_gpu)
    else:
        raise NotImplementedError("Model of type {} not supported".format(model_type))

    return trainer