Example #1
        obs_space = temp_env.observation_space
        act_space = temp_env.action_space

        trainer_config = with_updates(
            base_dict=POKER_TRAINER_BASE_CONFIG,
            updates_dict={
                "multiagent": {
                    "policies": {
                        TRAIN_POLICY:
                        (TRAIN_POLICY_CLASS, obs_space, act_space, {
                            'model': train_model_config,
                        }),
                        STATIC_POLICY:
                        (STATIC_POLICY_CLASS, obs_space, act_space, {
                            'model': static_model_config,
                        }),
                    },
                    "policy_mapping_fn": train_policy_mapping_fn,
                    "policies_to_train": [TRAIN_POLICY],
                },
                "callbacks_after_trainer_init": [
                    init_static_policy_distribution_after_trainer_init_callback,
                ],
                "callbacks": {
                    "on_train_result":
                    stop_and_submit_if_not_improving_on_train_result_callback,
                    'on_episode_start':
                    sample_new_static_policy_weights_for_each_worker_on_episode_start,
                },
            })

        # save running script to file
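
Example #1 reads the policy spaces off a throwaway environment instance before building the multiagent policies dict. A minimal sketch of that setup, assuming ENV_CLASS and POKER_ENV_CONFIG as defined in Example #4 and an env class that accepts an env_config dict (only temp_env, obs_space, and act_space appear in the snippet above; the rest is illustrative):

# Illustrative sketch (not the repo's exact code): construct a temporary env
# solely to read the observation/action spaces used in the policy tuples.
temp_env = ENV_CLASS(env_config=POKER_ENV_CONFIG)
obs_space = temp_env.observation_space
act_space = temp_env.action_space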
 "INFO",
 "metrics_smoothing_episodes":
 10000,
 "memory_per_worker":
 7019430400,
 "num_envs_per_worker":
 1,
 "num_workers":
 num_workers,
 "num_gpus_per_worker":
 0,
 "env":
 POKER_ENV,
 "env_config":
 with_updates(base_dict=poker_env_config,
              updates_dict={
                  'num_envs': 64,
              }),
 "multiagent": {
     "policies": {
         TRAIN_POLICY: (SACDiscreteTFPolicy, obs_space, act_space, {
             'model': model_config,
         }),
         STATIC_POLICY: (SACDiscreteTFPolicy, obs_space, act_space, {
             'model': model_config,
         }),
     },
     "policy_mapping_fn": train_policy_mapping_fn,
     "policies_to_train": [TRAIN_POLICY],
 },
 "callbacks_after_trainer_init": [
     lambda trainer: trainer.save_policy_model_configs_to_json(),
trainer_config = with_updates(
    base_dict=POKER_TRAINER_BASE_CONFIG,
    updates_dict={
        "multiagent": {
            "policies": {
                TRAIN_POLICY:
                (TRAIN_POLICY_CLASS, obs_space, act_space, {
                    'model': train_model_config,
                }),
                STATIC_POLICY:
                (STATIC_POLICY_CLASS, obs_space, act_space, {
                    'model': static_model_config,
                }),
            },
            "policy_mapping_fn": train_policy_mapping_fn,
            "policies_to_train": [TRAIN_POLICY],
        },
        "callbacks_after_trainer_init": [
            claim_new_active_policy_after_trainer_init_callback,
            # evo_update,
        ],
        # "callbacks_after_optim_step": [
        #     evo_update,
        # ],
        "callbacks": {
            "on_train_result":
            all_on_train_result_callbacks,
            'on_episode_start':
            sample_new_static_policy_weights_for_each_worker_on_episode_start,
        },
    })
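
train_policy_mapping_fn is referenced by every multiagent block above but is not defined in these snippets. A hypothetical sketch, assuming a two-seat poker environment with integer agent ids 0 and 1 (the real function may randomize which seat is controlled by the learning policy):

def train_policy_mapping_fn(agent_id):
    # Route one seat to the policy being trained and the other to the
    # frozen opponent whose weights are resampled each episode.
    return TRAIN_POLICY if agent_id == 0 else STATIC_POLICY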
Example #4
if POKER_GAME_VERSION == LEDUC_POKER:
    POKER_ENV_CONFIG = {
        'version': POKER_GAME_VERSION,
    }
    SELECTED_CONFIG_KEY = POKER_ARCH1_MODEL_CONFIG_KEY
    ENV_CLASS = PokerMultiAgentEnv
    POKER_TRAINER_BASE_CONFIG = {
        "log_level": "DEBUG",
        "metrics_smoothing_episodes": 10000,
        "memory_per_worker": 1019430400,
        "num_envs_per_worker": 1,
        "num_workers": 2,
        "num_gpus_per_worker": 0.0,
        "env": POKER_ENV,
        "env_config": with_updates(base_dict=POKER_ENV_CONFIG, updates_dict={
            'num_envs': 1,
        }),

        "buffer_size": int(20000),
        "learning_starts": 10000,
        "tau": 0.01,
        "gamma": 1.0,
        "train_batch_size": 1024,
        "optimization": {
            "actor_learning_rate": 0.01,
            "critic_learning_rate": 0.01,
            "entropy_learning_rate": 0.01,
        },
        "max_entropy_target_proportion": 0.0,
        "batch_mode": 'complete_episodes',
        "num_gpus": 0,
Example #5
 "DEBUG",
 "metrics_smoothing_episodes":
 10000,
 "memory_per_worker":
 1019430400,
 "num_envs_per_worker":
 1,
 "num_workers":
 2,
 "num_gpus_per_worker":
 0.0,
 "env":
 POKER_ENV,
 "env_config":
 with_updates(base_dict=POKER_ENV_CONFIG,
              updates_dict={
                  'num_envs': 1,
              }),
 "buffer_size":
 int(20000),
 "learning_starts":
 10000,
 "tau":
 0.01,
 "gamma":
 1.0,
 "train_batch_size":
 1024,
 "optimization": {
     "actor_learning_rate": 0.01,
     "critic_learning_rate": 0.01,
     "entropy_learning_rate": 0.01,