def get_sarsa_parameters_factorized(self):
    return ContinuousActionModelParameters(
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=1.0,
            reward_burnin=100,
            maxq_learning=False,
        ),
        training=TrainingParameters(
            # These are used by the reward network
            layers=[-1, 256, 128, -1],
            activations=["relu", "relu", "linear"],
            factorization_parameters=FactorizationParameters(
                state=FeedForwardParameters(
                    layers=[-1, 128, 64], activations=["relu", "linear"]
                ),
                action=FeedForwardParameters(
                    layers=[-1, 128, 64], activations=["relu", "linear"]
                ),
            ),
            minibatch_size=self.minibatch_size,
            learning_rate=0.03,
            optimizer="ADAM",
        ),
        rainbow=RainbowDQNParameters(
            double_q_learning=True, dueling_architecture=False
        ),
        in_training_cpe=InTrainingCPEParameters(mdp_sampled_rate=0.1),
    )
def get_sarsa_trainer_reward_boost(self, environment, reward_shape):
    rl_parameters = RLParameters(
        gamma=DISCOUNT,
        target_update_rate=1.0,
        reward_burnin=10,
        maxq_learning=False,
        reward_boost=reward_shape,
    )
    training_parameters = TrainingParameters(
        layers=[-1, -1],
        activations=["linear"],
        minibatch_size=self.minibatch_size,
        learning_rate=0.125,
        optimizer="ADAM",
    )
    return DiscreteActionTrainer(
        DiscreteActionModelParameters(
            actions=environment.ACTIONS,
            rl=rl_parameters,
            training=training_parameters,
            rainbow=RainbowDQNParameters(
                double_q_learning=True, dueling_architecture=False
            ),
            in_training_cpe=InTrainingCPEParameters(mdp_sampled_rate=0.1),
        ),
        environment.normalization,
    )
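# Hedged usage sketch (an assumption, not part of the original tests): in this
# codebase reward_boost is a map from action name to an additive reward bonus,
# so a shaped trainer for an environment would be built roughly as below. The
# boost value and the helper name are illustrative only.
def _example_reward_boost_usage(self, environment):
    # Boost the reward of every action by 0.5 (illustrative value).
    reward_shape = {action: 0.5 for action in environment.ACTIONS}
    return self.get_sarsa_trainer_reward_boost(environment, reward_shape)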
def get_sarsa_parameters_factorized(self):
    return ContinuousActionModelParameters(
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=1.0,
            reward_burnin=100,
            maxq_learning=False,
        ),
        training=TrainingParameters(
            layers=[],
            activations=[],
            factorization_parameters=FactorizationParameters(
                state=FeedForwardParameters(
                    layers=[-1, 128, 64, 32],
                    activations=["relu", "relu", "linear"],
                ),
                action=FeedForwardParameters(
                    layers=[-1, 128, 64, 32],
                    activations=["relu", "relu", "linear"],
                ),
            ),
            minibatch_size=self.minibatch_size,
            learning_rate=0.05,
            optimizer="ADAM",
        ),
        knn=KnnParameters(model_type="DQN"),
        in_training_cpe_evaluation=InTrainingCPEParameters(mdp_sampled_rate=0.1),
    )
def get_sarsa_parameters(self):
    return ContinuousActionModelParameters(
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=1.0,
            reward_burnin=100,
            maxq_learning=False,
        ),
        training=TrainingParameters(
            layers=[-1, 256, 128, -1],
            activations=["relu", "relu", "linear"],
            minibatch_size=self.minibatch_size,
            learning_rate=0.05,
            optimizer="ADAM",
        ),
        knn=KnnParameters(model_type="DQN"),
        rainbow=RainbowDQNParameters(
            double_q_learning=True, dueling_architecture=False
        ),
        in_training_cpe=InTrainingCPEParameters(mdp_sampled_rate=0.1),
    )
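# Hedged sketch of how these parameter builders are consumed, inferred from the
# ParametricDQNTrainer construction in the workflow code below rather than from
# an original test. "environment.normalization" appears above;
# "environment.normalization_action" is an assumed attribute for the action side.
def _example_build_parametric_trainer(self, environment):
    parameters = self.get_sarsa_parameters()
    return ParametricDQNTrainer(
        parameters,
        environment.normalization,         # state normalization
        environment.normalization_action,  # action normalization (assumed attribute)
        use_gpu=False,
    )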
def train_network(params):
    logger.info("Running Parametric DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"]
        )
    else:
        in_training_cpe_parameters = None

    trainer_params = ContinuousActionModelParameters(
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    eval_dataset = JSONDataset(
        params["eval_data_path"], batch_size=training_parameters.minibatch_size
    )
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = ParametricDQNTrainer(
        trainer_params,
        state_normalization,
        action_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, False)
    action_preprocessor = Preprocessor(action_normalization, False)

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            None,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
            metrics_to_score=trainer.metrics_to_score,
        )
    else:
        evaluator = Evaluator(
            None,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
            metrics_to_score=trainer.metrics_to_score,
        )

    start_time = time.time()
    for epoch in range(params["epochs"]):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch, params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor, batch, action_preprocessor=action_preprocessor
            )
            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

        # Run CPE on the held-out eval set at the end of each epoch,
        # accumulating evaluation data pages across batches.
        eval_dataset.reset_iterator()
        accumulated_edp = None
        for batch_idx in range(num_batches):
            batch = eval_dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor, batch, action_preprocessor=action_preprocessor
            )
            edp = EvaluationDataPage.create_from_tdp(tdp, trainer)
            if accumulated_edp is None:
                accumulated_edp = edp
            else:
                accumulated_edp = accumulated_edp.append(edp)
        accumulated_edp = accumulated_edp.compute_values(trainer.gamma)

        cpe_start_time = time.time()
        details = evaluator.evaluate_post_training(accumulated_edp)
        details.log()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * params["epochs"]) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    return export_trainer_and_predictor(trainer, params["model_output_path"])
def train_network(params):
    logger.info("Running Parametric DQN workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"]
        )
    else:
        in_training_cpe_parameters = None

    trainer_params = ContinuousActionModelParameters(
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = ParametricDQNTrainer(
        trainer_params,
        state_normalization,
        action_normalization,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, False)
    action_preprocessor = Preprocessor(action_normalization, False)

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            None,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
            metrics_to_score=trainer.metrics_to_score,
        )
    else:
        evaluator = Evaluator(
            None,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
            metrics_to_score=trainer.metrics_to_score,
        )

    start_time = time.time()
    for epoch in range(params["epochs"]):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch, params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor, batch, action_preprocessor=action_preprocessor
            )
            tdp.set_type(trainer.dtype)
            trainer.train(tdp)
            evaluator.collect_parametric_action_samples(
                mdp_ids=tdp.mdp_ids,
                sequence_numbers=tdp.sequence_numbers.cpu().numpy(),
                logged_actions=tdp.actions.cpu().numpy(),
                logged_possible_actions_mask=tdp.possible_actions_mask.cpu().numpy(),
                logged_rewards=tdp.rewards.cpu().numpy(),
                logged_propensities=tdp.propensities.cpu().numpy(),
                logged_terminals=(1.0 - tdp.not_terminal),
                possible_state_actions=tdp.possible_actions_state_concat.cpu().numpy(),
                num_possible_actions=tdp.possible_actions_mask.shape[1],
                # Dummy until Parametric CPE on metrics is implemented
                metrics=tdp.rewards.cpu().numpy(),
            )

        cpe_start_time = time.time()
        evaluator.recover_samples_to_be_unshuffled()
        evaluator.score_cpe(trainer_params.rl.gamma)
        evaluator.clear_collected_samples()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * params["epochs"]) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    return export_trainer_and_predictor(trainer, params["model_output_path"])
def train_network(params):
    logger.info("Running Parametric DQN workflow with params:")
    logger.info(params)

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"]
        )
    else:
        in_training_cpe_parameters = None

    trainer_params = ContinuousActionModelParameters(
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = ParametricDQNTrainer(
        trainer_params, state_normalization, action_normalization, params["use_gpu"]
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, params["use_gpu"])
    action_preprocessor = Preprocessor(action_normalization, params["use_gpu"])

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            None,
            100,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
        )
    else:
        evaluator = Evaluator(
            None,
            100,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
        )

    start_time = time.time()
    for epoch in range(params["epochs"]):
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch, params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor, batch, action_preprocessor=action_preprocessor
            )
            trainer.train(tdp)
            trainer.evaluate(evaluator, None, None, tdp.episode_values)
            evaluator.collect_parametric_action_samples(
                mdp_ids=tdp.mdp_ids,
                sequence_numbers=tdp.sequence_numbers,
                logged_state_actions=np.concatenate(
                    (tdp.states.cpu().numpy(), tdp.actions.cpu().numpy()), axis=1
                ),
                logged_rewards=tdp.rewards,
                logged_propensities=tdp.propensities,
                logged_terminals=(1.0 - tdp.not_terminals),
                possible_state_actions=tdp.state_pas_concat.cpu().numpy(),
                pas_lens=tdp.possible_actions_lengths,
            )

        cpe_start_time = time.time()
        evaluator.recover_samples_to_be_unshuffled()
        evaluator.score_cpe()
        evaluator.clear_collected_samples()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * params["epochs"]) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    return export_trainer_and_predictor(trainer, params["model_output_path"])
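# Hedged sketch of the params dict the parametric train_network variants above
# expect. The keys mirror the dictionary lookups in the function bodies; the
# paths and numeric values are illustrative placeholders, and "eval_data_path"
# is only read by the variant that builds an EvaluationDataPage.
EXAMPLE_PARAMETRIC_PARAMS = {
    "rl": {
        "gamma": 0.99,
        "target_update_rate": 1.0,
        "reward_burnin": 100,
        "maxq_learning": False,
    },
    "training": {
        "layers": [-1, 256, 128, -1],
        "activations": ["relu", "relu", "linear"],
        "minibatch_size": 1024,
        "learning_rate": 0.05,
        "optimizer": "ADAM",
    },
    "rainbow": {"double_q_learning": True, "dueling_architecture": False},
    "in_training_cpe": {"mdp_sampled_rate": 0.1},
    "training_data_path": "/tmp/training_data.json",  # hypothetical path
    "eval_data_path": "/tmp/eval_data.json",  # hypothetical path
    "state_norm_data_path": "/tmp/state_norm.json",  # hypothetical path
    "action_norm_data_path": "/tmp/action_norm.json",  # hypothetical path
    "model_output_path": "/tmp/model",  # hypothetical path
    "use_gpu": False,
    "use_all_avail_gpus": False,
    "epochs": 10,
}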
def train_network(params):
    writer = None
    if params["model_output_path"] is not None:
        writer = SummaryWriter(
            log_dir=os.path.join(
                os.path.expanduser(params["model_output_path"]), "training_data"
            )
        )

    logger.info("Running DQN workflow with params:")
    logger.info(params)

    action_names = np.array(params["actions"])
    rl_parameters = RLParameters(**params["rl"])
    training_parameters = TrainingParameters(**params["training"])
    rainbow_parameters = RainbowDQNParameters(**params["rainbow"])
    if params["in_training_cpe"] is not None:
        in_training_cpe_parameters = InTrainingCPEParameters(
            **params["in_training_cpe"]
        )
    else:
        in_training_cpe_parameters = None

    trainer_params = DiscreteActionModelParameters(
        actions=params["actions"],
        rl=rl_parameters,
        training=training_parameters,
        rainbow=rainbow_parameters,
        in_training_cpe=in_training_cpe_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    state_normalization = read_norm_file(params["state_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    trainer = DQNTrainer(trainer_params, state_normalization, params["use_gpu"])
    trainer = update_model_for_warm_start(trainer)
    preprocessor = Preprocessor(state_normalization, params["use_gpu"])

    if trainer_params.in_training_cpe is not None:
        evaluator = Evaluator(
            trainer_params.actions,
            10,
            trainer_params.rl.gamma,
            trainer,
            trainer_params.in_training_cpe.mdp_sampled_rate,
        )
    else:
        evaluator = Evaluator(
            trainer_params.actions,
            10,
            trainer_params.rl.gamma,
            trainer,
            float(DEFAULT_NUM_SAMPLES_FOR_CPE) / len(dataset),
        )

    start_time = time.time()
    for epoch in range(int(params["epochs"])):
        for batch_idx in range(num_batches):
            report_training_status(
                batch_idx, num_batches, epoch, int(params["epochs"])
            )
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(preprocessor, batch, action_names)
            trainer.train(tdp)
            trainer.evaluate(
                evaluator, tdp.actions, None, tdp.rewards, tdp.episode_values
            )
            evaluator.collect_discrete_action_samples(
                mdp_ids=tdp.mdp_ids,
                sequence_numbers=tdp.sequence_numbers.cpu().numpy(),
                states=tdp.states.cpu().numpy(),
                logged_actions=tdp.actions.cpu().numpy(),
                logged_rewards=tdp.rewards.cpu().numpy(),
                logged_propensities=tdp.propensities.cpu().numpy(),
                logged_terminals=np.invert(
                    tdp.not_terminals.cpu().numpy().astype(np.bool)
                ),
            )

        cpe_start_time = time.time()
        evaluator.recover_samples_to_be_unshuffled()
        evaluator.score_cpe()
        if writer is not None:
            evaluator.log_to_tensorboard(writer, epoch)
        evaluator.clear_collected_samples()
        logger.info(
            "CPE evaluation took {} seconds.".format(time.time() - cpe_start_time)
        )

    through_put = (len(dataset) * int(params["epochs"])) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(through_put))
    )

    if writer is not None:
        writer.close()
    return export_trainer_and_predictor(trainer, params["model_output_path"])
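# Hedged sketch of driving the discrete-action workflow above from a JSON config
# file, mirroring how train_network reads everything out of a single params
# dict. The config path and helper name are hypothetical placeholders; the
# discrete workflow additionally requires params["actions"], a list of action
# names matching the training data.
import json

def _example_run_dqn_workflow(config_path="/tmp/dqn_example.json"):
    with open(config_path) as f:
        params = json.load(f)
    return train_network(params)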