def test_overriding_default_resource_request(self):
    config = DEFAULT_CONFIG.copy()
    config["model"]["fcnet_hiddens"] = [10]
    config["num_workers"] = 2
    # 3 Trials: Can only run 2 at a time (num_cpus=6; needed: 3).
    config["lr"] = tune.grid_search([0.1, 0.01, 0.001])
    config["env"] = "CartPole-v0"
    config["framework"] = "tf"

    class DefaultResourceRequest:
        @classmethod
        def default_resource_request(cls, config):
            head_bundle = {"CPU": 1, "GPU": 0}
            child_bundle = {"CPU": 1}
            return PlacementGroupFactory(
                [head_bundle, child_bundle, child_bundle],
                strategy=config["placement_strategy"])

    # Create a trainer with an overridden default_resource_request
    # method that returns a PlacementGroupFactory.
    MyTrainer = PGTrainer.with_updates(mixins=[DefaultResourceRequest])
    tune.register_trainable("my_trainable", MyTrainer)

    global trial_executor
    trial_executor = RayTrialExecutor(reuse_actors=False)

    tune.run(
        "my_trainable",
        config=config,
        stop={"training_iteration": 2},
        trial_executor=trial_executor,
        callbacks=[_TestCallback()],
        verbose=2,
    )
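# A minimal standalone sketch of the resource arithmetic the test above
# relies on, assuming Ray 1.x where PlacementGroupFactory lives in
# ray.tune.utils.placement_groups (the exact import path varies by version).
# One head bundle (the trainer driver) plus one child bundle per rollout
# worker means each trial reserves 3 CPUs, so with num_cpus=6 only 2 of the
# 3 grid-search trials can run at a time.
from ray.tune.utils.placement_groups import PlacementGroupFactory

bundles = [{"CPU": 1, "GPU": 0}, {"CPU": 1}, {"CPU": 1}]
pgf = PlacementGroupFactory(bundles, strategy="PACK")
cpus_per_trial = sum(bundle.get("CPU", 0) for bundle in bundles)
assert cpus_per_trial == 3  # 6 available CPUs // 3 per trial == 2 concurrent trials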
def run_with_custom_entropy_loss(args, stop):
    """Example of customizing the loss function of an existing policy.

    This performs about the same as the default loss does."""

    def entropy_policy_gradient_loss(policy, model, dist_class, train_batch):
        logits, _ = model.from_batch(train_batch)
        action_dist = dist_class(logits, model)
        if args.torch:
            # Required by PGTorchPolicy's stats fn.
            policy.pi_err = torch.tensor([0.0])
            return torch.mean(-0.1 * action_dist.entropy() -
                              (action_dist.logp(train_batch["actions"]) *
                               train_batch["advantages"]))
        else:
            return (-0.1 * action_dist.entropy() - tf.reduce_mean(
                action_dist.logp(train_batch["actions"]) *
                train_batch["advantages"]))

    policy_cls = PGTorchPolicy if args.torch else PGTFPolicy
    EntropyPolicy = policy_cls.with_updates(
        loss_fn=entropy_policy_gradient_loss)

    EntropyLossPG = PGTrainer.with_updates(
        name="EntropyPG", get_policy_class=lambda _: EntropyPolicy)

    run_heuristic_vs_learned(args, use_lstm=True, trainer=EntropyLossPG)
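# A toy illustration (plain PyTorch, outside RLlib) of the loss shape used
# above: vanilla policy gradient plus an entropy bonus with coefficient 0.1,
#     loss = -0.1 * H(pi) - E[log pi(a|s) * advantage].
# All tensors here are made-up stand-ins for a real train batch.
import torch
from torch.distributions import Categorical

logits = torch.randn(4, 2)          # batch of 4 observations, 2 actions
actions = torch.tensor([0, 1, 1, 0])
advantages = torch.randn(4)

dist = Categorical(logits=logits)
loss = torch.mean(-0.1 * dist.entropy() -
                  dist.log_prob(actions) * advantages)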
def main(debug):
    ray.init(num_cpus=os.cpu_count(), num_gpus=0)

    stop = {"episodes_total": 10 if debug else 400}

    env_config = {
        "max_steps": 10,
        "players_ids": ["player_row", "player_col"],
    }

    policies = {
        env_config["players_ids"][0]: (None,
                                       IteratedBoSAndPD.OBSERVATION_SPACE,
                                       IteratedBoSAndPD.ACTION_SPACE, {}),
        env_config["players_ids"][1]: (None,
                                       IteratedBoSAndPD.OBSERVATION_SPACE,
                                       IteratedBoSAndPD.ACTION_SPACE, {}),
    }

    rllib_config = {
        "env": IteratedBoSAndPD,
        "env_config": env_config,
        "num_gpus": 0,
        "num_workers": 1,
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": (lambda agent_id: agent_id),
        },
        "framework": "torch",
        "gamma": 0.5,
        "callbacks": miscellaneous.merge_callbacks(
            log.get_logging_callbacks_class(),
            postprocessing.OverwriteRewardWtWelfareCallback),
    }

    MyPGTorchPolicy = PGTorchPolicy.with_updates(
        postprocess_fn=miscellaneous.merge_policy_postprocessing_fn(
            postprocessing.get_postprocessing_welfare_function(
                add_inequity_aversion_welfare=True,
                inequity_aversion_beta=1.0,
                inequity_aversion_alpha=0.0,
                inequity_aversion_gamma=1.0,
                inequity_aversion_lambda=0.5),
            pg_torch_policy.post_process_advantages))

    MyPGTrainer = PGTrainer.with_updates(
        default_policy=MyPGTorchPolicy, get_policy_class=None)

    tune_analysis = tune.run(
        MyPGTrainer,
        stop=stop,
        checkpoint_freq=10,
        config=rllib_config)

    ray.shutdown()
    return tune_analysis
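# Hypothetical CLI entrypoint (a sketch; the original script may wire its
# flags differently). Runs the short 10-episode smoke test when --debug is
# passed, and the full 400-episode training otherwise.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--debug", action="store_true",
                        help="Run only a few episodes as a smoke test.")
    cli_args = parser.parse_args()
    main(debug=cli_args.debug)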