Example #1
Trains a DDPGTrainer on GridworldContinuous samples and verifies that both the actor and critic predictors can be built and evaluated.
    def test_ddpg_trainer(self):
        environment = GridworldContinuous()
        samples = environment.generate_samples(500000, 0.25)
        trainer = DDPGTrainer(
            self.get_ddpg_parameters(),
            environment.normalization,
            environment.normalization_action,
            environment.min_action_range,
            environment.max_action_range,
        )
        evaluator = GridworldDDPGEvaluator(environment, True, DISCOUNT, False,
                                           samples)
        tdps = environment.preprocess_samples(samples, self.minibatch_size)

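        # Baseline: evaluate the critic once before any training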
        critic_predictor = trainer.predictor(actor=False)
        evaluator.evaluate_critic(critic_predictor)
        for tdp in tdps:
            tdp.rewards = tdp.rewards.flatten()
            tdp.not_terminals = tdp.not_terminals.flatten()
            trainer.train(tdp)

        # Make sure actor predictor works
        actor = trainer.predictor(actor=True)
        evaluator.evaluate_actor(actor)

        # Evaluate critic predictor for correctness
        critic_predictor = trainer.predictor(actor=False)
        error = evaluator.evaluate_critic(critic_predictor)
        print("gridworld MAE: {0:.3f}".format(error))
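Neither of the first two examples shows get_ddpg_parameters(). A minimal sketch of what such a helper could return, assuming the same four-block DDPGModelParameters structure that Example #3 builds explicitly and assuming the parameter classes provide defaults for omitted fields (gamma is an assumed RLParameters field name, not taken from these examples):

    def get_ddpg_parameters(self):
        # Sketch only: the four blocks mirror the DDPGModelParameters
        # construction in Example #3; individual field names are assumed.
        return DDPGModelParameters(
            rl=RLParameters(gamma=DISCOUNT),  # assumed field name
            shared_training=DDPGTrainingParameters(
                minibatch_size=self.minibatch_size
            ),
            actor_training=DDPGNetworkParameters(),
            critic_training=DDPGNetworkParameters(),
        )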
Example #2
The same gridworld test against a different revision of the trainer interface: DDPGTrainer takes no action-range arguments here, and train() consumes a positional sample list instead of a TrainingDataPage.
    def test_ddpg_trainer(self):
        environment = GridworldContinuous()
        samples = environment.generate_samples(200000, 1.0)
        epochs = 3
        trainer = DDPGTrainer(
            self.get_ddpg_parameters(),
            environment.normalization,
            environment.normalization_action,
        )
        evaluator = GridworldDDPGEvaluator(environment, True)
        tdps = environment.preprocess_samples(samples, self.minibatch_size)

        for epoch in range(epochs):
            print("On epoch {} of {}".format(epoch + 1, epochs))
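            # Evaluate the critic at the start of each epoch to track progress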
            critic_predictor = trainer.predictor()
            evaluator.evaluate_critic(critic_predictor)
            for tdp in tdps:
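                # Positional batch layout for this trainer revision; None
                # fills slots this example leaves unset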
                training_samples = [
                    tdp.states,
                    tdp.actions,
                    tdp.rewards.flatten(),
                    tdp.next_states,
                    None,
                    1 - tdp.not_terminals.flatten(),  # done
                    None,
                    None,
                    [1 for i in range(len(tdp.states))],  # time diff
                ]
                trainer.train(training_samples)

        critic_predictor = trainer.predictor()
        error = evaluator.evaluate_critic(critic_predictor)
        print("gridworld MAE: {0:.3f}".format(error))
Example #3
An end-to-end training workflow: loads a JSON dataset, builds state and action preprocessors from normalization files, trains a DDPGTrainer for a configured number of epochs, and exports the trained model.
def train_network(params):
    logger.info("Running DDPG workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["shared_training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"])

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = DDPGTrainingParameters(**params["shared_training"])
    actor_parameters = DDPGNetworkParameters(**params["actor_training"])
    critic_parameters = DDPGNetworkParameters(**params["critic_training"])

    trainer_params = DDPGModelParameters(
        rl=rl_parameters,
        shared_training=training_parameters,
        actor_training=actor_parameters,
        critic_training=critic_parameters,
    )

    dataset = JSONDataset(params["training_data_path"],
                          batch_size=training_parameters.minibatch_size)
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info("Read in batch data set {} of size {} examples. Data split "
                "into {} batches of size {}.".format(
                    params["training_data_path"],
                    len(dataset),
                    num_batches,
                    training_parameters.minibatch_size,
                ))

    min_action_range_tensor_serving, max_action_range_tensor_serving = construct_action_scale_tensor(
        action_normalization, trainer_params.action_rescale_map)

    trainer = DDPGTrainer(
        trainer_params,
        state_normalization,
        action_normalization,
        min_action_range_tensor_serving,
        max_action_range_tensor_serving,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, params["use_gpu"])
    action_preprocessor = Preprocessor(action_normalization, params["use_gpu"])

    start_time = time.time()
    for epoch in range(params["epochs"]):
        dataset.reset_iterator()
        for batch_idx in range(num_batches):
            report_training_status(batch_idx, num_batches, epoch,
                                   params["epochs"])
            batch = dataset.read_batch(batch_idx)
            tdp = preprocess_batch_for_training(
                state_preprocessor,
                batch,
                action_preprocessor=action_preprocessor)
            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

    throughput = (len(dataset) * params["epochs"]) / (time.time() -
                                                      start_time)
    logger.info("Training finished. Processed ~{} examples/s.".format(
        round(throughput)))

    return export_trainer_and_predictor(trainer, params["model_output_path"])
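For orientation, here is a skeleton of the params dict that train_network reads, assembled purely from the keys accessed above. Every value is a placeholder rather than a tested configuration, and the empty nested dicts assume the parameter classes accept no-argument construction:

params = {
    "rl": {},                                     # kwargs for RLParameters
    "shared_training": {"minibatch_size": 1024},  # must contain minibatch_size
    "actor_training": {},                         # kwargs for DDPGNetworkParameters
    "critic_training": {},                        # kwargs for DDPGNetworkParameters
    "training_data_path": "training_data.json",   # placeholder path
    "state_norm_data_path": "state_norm.json",    # placeholder path
    "action_norm_data_path": "action_norm.json",  # placeholder path
    "model_output_path": "/tmp/ddpg_model",       # placeholder path
    "use_gpu": False,
    "use_all_avail_gpus": False,
    "epochs": 1,
}
train_network(params)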
Example #4
A variant of the same workflow that consumes the dataset as a stream, calling read_batch() until it returns None, and builds its preprocessors on CPU.
def train_network(params):
    logger.info("Running DDPG workflow with params:")
    logger.info(params)

    # Set minibatch size based on # of devices being used to train
    params["shared_training"]["minibatch_size"] *= minibatch_size_multiplier(
        params["use_gpu"], params["use_all_avail_gpus"]
    )

    rl_parameters = RLParameters(**params["rl"])
    training_parameters = DDPGTrainingParameters(**params["shared_training"])
    actor_parameters = DDPGNetworkParameters(**params["actor_training"])
    critic_parameters = DDPGNetworkParameters(**params["critic_training"])

    trainer_params = DDPGModelParameters(
        rl=rl_parameters,
        shared_training=training_parameters,
        actor_training=actor_parameters,
        critic_training=critic_parameters,
    )

    dataset = JSONDataset(
        params["training_data_path"], batch_size=training_parameters.minibatch_size
    )
    state_normalization = read_norm_file(params["state_norm_data_path"])
    action_normalization = read_norm_file(params["action_norm_data_path"])

    num_batches = int(len(dataset) / training_parameters.minibatch_size)
    logger.info(
        "Read in batch data set {} of size {} examples. Data split "
        "into {} batches of size {}.".format(
            params["training_data_path"],
            len(dataset),
            num_batches,
            training_parameters.minibatch_size,
        )
    )

    min_action_range_tensor_serving, max_action_range_tensor_serving = construct_action_scale_tensor(
        action_normalization, trainer_params.action_rescale_map
    )

    trainer = DDPGTrainer(
        trainer_params,
        state_normalization,
        action_normalization,
        min_action_range_tensor_serving,
        max_action_range_tensor_serving,
        use_gpu=params["use_gpu"],
        use_all_avail_gpus=params["use_all_avail_gpus"],
    )
    trainer = update_model_for_warm_start(trainer)
    state_preprocessor = Preprocessor(state_normalization, False)
    action_preprocessor = Preprocessor(action_normalization, False)

    start_time = time.time()
    for epoch in range(params["epochs"]):
        dataset.reset_iterator()
        batch_idx = -1
        while True:
            batch_idx += 1
            report_training_status(batch_idx, num_batches, epoch, params["epochs"])
            batch = dataset.read_batch()
            if batch is None:
                break
            tdp = preprocess_batch_for_training(
                state_preprocessor, batch, action_preprocessor=action_preprocessor
            )
            tdp.set_type(trainer.dtype)
            trainer.train(tdp)

    throughput = (len(dataset) * params["epochs"]) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples/s.".format(round(throughput))
    )

    return export_trainer_and_predictor(trainer, params["model_output_path"])
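Relative to Example #3, the batch loop here no longer indexes into the dataset: read_batch() is called without an argument and the epoch ends when it returns None, so the loop also handles datasets whose size is not an exact multiple of the minibatch size. Note too that the preprocessors are constructed with use_gpu hard-coded to False, independent of the use_gpu flag passed to the trainer.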