Example #1
    def test_cartpole_lstm(self):
        with ray_start_client_server():
            assert ray.util.client.ray.is_connected()

            config = {
                "env": StatelessCartPole,
                # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
                "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
                "num_sgd_iter": 5,
                "vf_loss_coeff": 0.0001,
                "model": {
                    "vf_share_layers": True,
                    "use_lstm": True,
                    "lstm_cell_size": 256,
                    # Feed the previous action/reward into the LSTM
                    # (off here, matching RLlib's model defaults).
                    "lstm_use_prev_action": False,
                    "lstm_use_prev_reward": False,
                },
                "framework": "tf",
                # Run with tracing enabled for tfe/tf2.
                "eager_tracing": False,
            }

            stop = {
                "training_iteration": 200,
                "timesteps_total": 100000,
                "episode_reward_mean": 150.0,
            }

            results = tune.run("PPO", config=config, stop=stop, verbose=2)
            check_learning_achieved(results, 150.0)
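
The examples on this page all end with RLlib's check_learning_achieved test utility. Roughly, it looks up the best episode_reward_mean across all finished trials (or the evaluation workers' mean when evaluation=True) and raises if the target reward was not reached. A minimal sketch of that behavior, based on the Ray 1.x ray.rllib.utils.test_utils version (exact signature and error message may differ between Ray releases):

def check_learning_achieved(tune_results, min_reward, evaluation=False):
    # Best mean episode reward over all trials of the experiment.
    best_avg_reward = max(
        trial.last_result["evaluation"]["episode_reward_mean"]
        if evaluation else trial.last_result["episode_reward_mean"]
        for trial in tune_results.trials)
    # Fail loudly if the learning target was not reached.
    if best_avg_reward < min_reward:
        raise ValueError(f"`stop-reward` of {min_reward} not reached!")
    print("ok")

Note that the marltoolbox tests further down this page use their own variant of this helper with additional min/max keyword arguments.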
Example #2
    def test_curiosity_on_partially_observable_domain(self):
        config = ppo.DEFAULT_CONFIG.copy()
        config["env"] = "mini-grid"
        config["env_config"] = {
            # Also works with:
            # - MiniGrid-MultiRoom-N4-S5-v0
            # - MiniGrid-MultiRoom-N2-S4-v0
            "name": "MiniGrid-Empty-8x8-v0",
            "framestack": 1,  # seems to work even w/o framestacking
        }
        config["horizon"] = 15  # Make it impossible to reach goal by chance.
        config["num_envs_per_worker"] = 4
        config["model"]["fcnet_hiddens"] = [256, 256]
        config["model"]["fcnet_activation"] = "relu"
        config["num_sgd_iter"] = 8
        config["num_workers"] = 0

        config["exploration_config"] = {
            "type": "Curiosity",
            # For the feature NN, use a non-LSTM fcnet (same as the one
            # in the policy model).
            "eta": 0.1,
            "lr": 0.0003,  # 0.0003 or 0.0005 seem to work fine as well.
            "feature_dim": 64,
            # No actual feature net: map directly from observations to feature
            # vector (linearly).
            "feature_net_config": {
                "fcnet_hiddens": [],
                "fcnet_activation": "relu",
            },
            "sub_exploration": {
                "type": "StochasticSampling",
            }
        }

        min_reward = 0.001
        stop = {
            "training_iteration": 25,
            "episode_reward_mean": min_reward,
        }
        for _ in framework_iterator(config, frameworks="torch"):
            # To replay:
            # trainer = ppo.PPOTrainer(config=config)
            # trainer.restore("[checkpoint file]")
            # env = env_maker(config["env_config"])
            # s = env.reset()
            # for _ in range(10000):
            #     s, r, d, _ = env.step(trainer.compute_action(s))
            #     if d:
            #         s = env.reset()
            #     env.render()

            results = tune.run("PPO", config=config, stop=stop, verbose=1)
            check_learning_achieved(results, min_reward)
            iters = results.trials[0].last_result["training_iteration"]
            print("Reached in {} iterations.".format(iters))
Example #3
def run_same_policy(args, stop):
    """Use the same policy for both agents (trivial case)."""
    config = {
        "env": RockPaperScissors,
        "framework": "torch" if args.torch else "tf",
    }

    results = tune.run("PG", config=config, stop=stop, verbose=1)

    if args.as_test:
        # Check vs 0.0 as we are playing a zero-sum game.
        check_learning_achieved(results, 0.0)
Example #4
    def train_model(self, run, config, stop, output_folder, checkpoint_freq,
                    save_folder, latest_checkpoint_path, as_test, stop_reward):
        """
        Train the model via RLlib's `tune.run` command.

        :param run: Deep reinforcement learning algorithm to run (e.g. "PPO").
        :param config: Training configuration.
        :param stop: Stop condition(s) for the training.
        :param output_folder: Path under which the checkpoints are saved.
        :param checkpoint_freq: Frequency (in training iterations) at which
            checkpoints are saved.
        :param save_folder: Name of the experiment folder inside `output_folder`
            (passed as `name` to `tune.run`).
        :param latest_checkpoint_path: Path to the latest checkpoint from which
            to continue training. If None, a new session is created.
        :param as_test: If True, check after training that `stop_reward` was
            reached (see the hypothetical invocation sketched after this example).
        :param stop_reward: Reward threshold checked when `as_test` is True.
        """
        if stop is None:
            raise Exception("No stop condition was set for the training! You need at least one.")
        results = tune.run(run, config=config, stop=stop, checkpoint_at_end=True, local_dir=output_folder,
                           checkpoint_freq=checkpoint_freq, name=save_folder, restore=latest_checkpoint_path)

        if as_test:
            check_learning_achieved(results, stop_reward)
        ray.shutdown()
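
A hypothetical invocation of the helper above, assuming runner is an instance of the (unnamed) enclosing class; all paths, names, and thresholds below are placeholders:

import ray
from ray.rllib.agents.ppo import DEFAULT_CONFIG

ray.init()
config = DEFAULT_CONFIG.copy()
config["env"] = "CartPole-v0"

runner.train_model(
    run="PPO",
    config=config,
    stop={"episode_reward_mean": 150.0},
    output_folder="./results",        # placeholder checkpoint root
    checkpoint_freq=10,
    save_folder="ppo_cartpole",       # experiment name inside ./results
    latest_checkpoint_path=None,      # start a fresh training session
    as_test=True,
    stop_reward=150.0,
)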
Example #5
        "evaluation_num_workers": args.evaluation_num_workers,
        # Evaluate every `evaluation_interval` calls to Trainer.train().
        "evaluation_interval": args.evaluation_interval,
        # Run for n episodes/timesteps (properly distribute load amongst
        # all eval workers). The longer it takes to evaluate, the more sense
        # it makes to use `evaluation_parallel_to_training=True`.
        # Use "auto" to run evaluation for roughly as long as the training
        # step takes.
        "evaluation_duration": args.evaluation_duration,
        # "episodes" or "timesteps".
        "evaluation_duration_unit": args.evaluation_duration_unit,

        # Use a custom callback that asserts that we run exactly the
        # configured number of episodes per evaluation OR, in "auto" mode,
        # at least as many episodes as there are eval workers.
        "callbacks": AssertEvalCallback,
    }

    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }

    results = tune.run(args.run, config=config, stop=stop, verbose=2)

    if args.as_test:
        check_learning_achieved(results, args.stop_reward)
    ray.shutdown()
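
The AssertEvalCallback passed via "callbacks" above is defined elsewhere in the example; a minimal sketch of what such a callback could look like under the Ray 1.x DefaultCallbacks API (the original performs more detailed checks):

from ray.rllib.agents.callbacks import DefaultCallbacks

class AssertEvalCallback(DefaultCallbacks):
    def on_train_result(self, *, trainer, result, **kwargs):
        # Evaluation results only show up in iterations that actually evaluated.
        if "evaluation" in result:
            num_episodes = result["evaluation"]["episodes_this_iter"]
            # At minimum, every eval worker should have produced one episode.
            assert num_episodes >= trainer.config["evaluation_num_workers"]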
Example #6
def main():
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    env_config = {
        "num_candidates": args.env_num_candidates,
        "resample_documents": not args.env_dont_resample_documents,
        "slate_size": args.env_slate_size,
        "seed": args.env_seed,
        "convert_to_discrete_action_space": args.run == "DQN",
    }

    config = {
        "env": (InterestEvolutionRecSimEnv
                if args.env == "interest-evolution"
                else InterestExplorationRecSimEnv
                if args.env == "interest-exploration"
                else LongTermSatisfactionRecSimEnv),
        "framework": args.framework,
        "num_gpus": args.num_gpus,
        "num_workers": args.num_workers,
        "env_config": env_config,
        "learning_starts": args.learning_starts,
    }

    # Perform a test run on the env with a random agent to see what the
    # random baseline reward is.
    if args.random_test_episodes:
        print(f"Running {args.random_test_episodes} episodes to get a random "
              "agent's baseline reward ...")
        env = config["env"](config=env_config)
        env.reset()
        num_episodes = 0
        episode_rewards = []
        episode_reward = 0.0
        while num_episodes < args.random_test_episodes:
            action = env.action_space.sample()
            _, r, d, _ = env.step(action)
            episode_reward += r
            if d:
                num_episodes += 1
                episode_rewards.append(episode_reward)
                episode_reward = 0.0
                env.reset()
        print(f"Ran {args.random_test_episodes} episodes with a random agent "
              "reaching a mean episode return of "
              f"{np.mean(episode_rewards)}+/-{sem(episode_rewards)}.")

    if args.use_tune:
        stop = {
            "training_iteration": args.stop_iters,
            "timesteps_total": args.stop_timesteps,
            "episode_reward_mean": args.stop_reward,
        }

        if args.run == "SlateQ":
            config.update({
                "slateq_strategy": args.slateq_strategy,
            })
        results = tune.run(
            args.run,
            stop=stop,
            config=config,
            num_samples=args.tune_num_samples,
            verbose=2,
        )

        if args.as_test:
            check_learning_achieved(results, args.stop_reward)

    else:
        # Directly run using the trainer interface (good for debugging).
        if args.run == "DQN":
            trainer = dqn.DQNTrainer(config=config)
        else:
            config.update({
                "slateq_strategy": args.slateq_strategy,
            })
            trainer = slateq.SlateQTrainer(config=config)
        for i in range(10):
            result = trainer.train()
            print(pretty_print(result))
    ray.shutdown()
Example #7
        "evaluation_num_workers": 2,
        # Optional custom eval function.
        "custom_eval_function": eval_fn,
        # Enable evaluation, once per training iteration.
        "evaluation_interval": 1,
        # Run 10 episodes each time evaluation runs.
        "evaluation_duration": 10,
        # Override the env config for evaluation.
        "evaluation_config": {
            "env_config": {
                # Evaluate using LONGER corridor than trained on.
                "corridor_length": 5,
            },
        },
        "framework": args.framework,
    }

    stop = {
        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
        "episode_reward_mean": args.stop_reward,
    }

    results = tune.run("PG", config=config, stop=stop, verbose=1)

    # Check eval results (from eval workers using the custom function),
    # not results from the regular workers.
    if args.as_test:
        check_learning_achieved(results, args.stop_reward, evaluation=True)
    ray.shutdown()
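
The eval_fn wired in via "custom_eval_function" above is not shown in this fragment. A minimal sketch of a custom evaluation function under the Ray 1.x API, which receives the trainer and the evaluation WorkerSet and must return a metrics dict (the original example additionally reconfigures the corridor length on each eval worker):

import ray
from ray.rllib.evaluation.metrics import collect_episodes, summarize_episodes

def eval_fn(trainer, eval_workers):
    # Run one sampling round on every remote evaluation worker.
    ray.get([w.sample.remote() for w in eval_workers.remote_workers()])
    # Collect the sampled episodes and condense them into a metrics dict.
    episodes, _ = collect_episodes(
        remote_workers=eval_workers.remote_workers(), timeout_seconds=600)
    return summarize_episodes(episodes)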
Example #8
File: meta_loop.py  Project: saffronh/asm
            "multiagent": {
                "policies_to_train": list(learned_planner_policy.keys()),
                "policies": {
                    **citizen_policies,
                    **learned_planner_policy
                },
                "policy_mapping_fn": policy_mapping_fn,
            },
        })

        # training loop
        citizen_results = tune.run(args.run, config=citizen_config, stop=stop)
        # Stop after some criterion is met, then train the government
        # (e.g. some convergence criterion of the citizen policies?).
        planner_results = tune.run(args.run, config=planner_config, stop=stop)
        if args.as_test:
            check_learning_achieved(citizen_results, args.stop_reward)
            check_learning_achieved(planner_results, args.stop_reward)
    else:

        # initialize policies
        policies = {
            "planner_policy":
            (PPOTFPolicy, env.planner_observation_space,
             env.planner_action_space, {
                 "model": {
                     "custom_model": "action_mask_model",
                     "use_lstm": True,
                     "custom_model_config": {
                         "actual_obs_space":
                         env.planner_observation_space["actual_obs"]
                     }
Example #9
def main():
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    env_config = {
        "num_candidates": args.env_num_candidates,
        "resample_documents": not args.env_dont_resample_documents,
        "slate_size": args.env_slate_size,
        "seed": args.env_seed,
        "convert_to_discrete_action_space": args.run == "DQN",
    }

    config = {
        "env": (InterestEvolutionRecSimEnv
                if args.env == "interest-evolution"
                else InterestExplorationRecSimEnv
                if args.env == "interest-exploration"
                else LongTermSatisfactionRecSimEnv),
        "hiddens": [1024, 1024],
        "num_gpus": args.num_gpus,
        "num_workers": args.num_workers,
        "env_config": env_config,
        "lr_choice_model": 0.003,
        "lr_q_model": 0.003,
        "rollout_fragment_length": 4,
        "exploration_config": {
            "epsilon_timesteps": 50000,
            "final_epsilon": 0.02,
        },
        "target_network_update_freq": 1,
        "tau": 5e-3,
        "evaluation_interval": 1,
        "evaluation_num_workers": 4,
        "evaluation_duration": 200,
        "evaluation_duration_unit": "episodes",
        "evaluation_parallel_to_training": True,
    }

    # Perform a test run on the env with a random agent to see what the
    # random baseline reward is.
    if args.random_test_episodes:
        print(f"Running {args.random_test_episodes} episodes to get a random "
              "agent's baseline reward ...")
        env = config["env"](config=env_config)
        env.reset()
        num_episodes = 0
        episode_rewards = []
        episode_reward = 0.0
        while num_episodes < args.random_test_episodes:
            action = env.action_space.sample()
            _, r, d, _ = env.step(action)
            episode_reward += r
            if d:
                num_episodes += 1
                episode_rewards.append(episode_reward)
                episode_reward = 0.0
                env.reset()
        print(f"Ran {args.random_test_episodes} episodes with a random agent "
              "reaching a mean episode return of "
              f"{np.mean(episode_rewards)}+/-{sem(episode_rewards)}.")

    if args.use_tune:
        stop = {
            "training_iteration": args.stop_iters,
            "timesteps_total": args.stop_timesteps,
            "episode_reward_mean": args.stop_reward,
        }

        time_signature = datetime.now().strftime("%Y-%m-%d_%H_%M_%S")
        name = f"SlateQ/{args.run}-seed{args.env_seed}-{time_signature}"
        if args.run == "SlateQ":
            config.update({
                "slateq_strategy": args.slateq_strategy,
            })
        results = tune.run(
            args.run,
            stop=stop,
            name=name,
            config=config,
            num_samples=args.tune_num_samples,
            verbose=2,
        )

        if args.as_test:
            check_learning_achieved(results, args.stop_reward)

    else:
        # Directly run using the trainer interface (good for debugging).
        if args.run == "DQN":
            trainer = dqn.DQNTrainer(config=config)
        else:
            config.update({
                "slateq_strategy": args.slateq_strategy,
            })
            trainer = slateq.SlateQTrainer(config=config)
        for i in range(10):
            result = trainer.train()
            print(pretty_print(result))
    ray.shutdown()
Example #10
def test_amtft_ipd():
    from marltoolbox.examples.rllib_api.amtft_various_env import main
    ray.shutdown()  # Shut down any lingering Ray instance before `main` re-initializes Ray.
    tune_analysis_per_welfare, analysis_metrics_per_mode = main(debug=False, train_n_replicates=1, filter_utilitarian=False)
    for welfare_name, tune_analysis in tune_analysis_per_welfare.items():
        check_learning_achieved(tune_results=tune_analysis, reward=-204, min=True)
Example #11
def test_ltft_ipd():
    from marltoolbox.examples.rllib_api.ltft_ipd import main
    ray.shutdown()  # Shut down any lingering Ray instance before `main` re-initializes Ray.
    tune_analysis_self_play, tune_analysis_naive_opponent = main(debug=False)
    check_learning_achieved(tune_results=tune_analysis_self_play, reward=-42, min=True)
    check_learning_achieved(tune_results=tune_analysis_naive_opponent, reward=-78, max=True)
Example #12
def test_ppo_asym_coin_game():
    from marltoolbox.examples.rllib_api.ppo_asymmetric_coin_game import main
    ray.shutdown()  # Shut down any lingering Ray instance before `main` re-initializes Ray.
    tune_analysis = main(debug=False, stop_iters=70)
    check_learning_achieved(tune_results=tune_analysis, reward=20, min=True)
Example #13
def test_pg_ipd():
    from marltoolbox.examples.rllib_api.pg_ipd import main
    ray.shutdown()  # Shut down any lingering Ray instance before `main` re-initializes Ray.
    tune_analysis = main(debug=False)
    check_learning_achieved(tune_results=tune_analysis, reward=-75, max=True)
Example #14
def main(algorithm: str, stop_iters: int, stop_timesteps: int,
         stop_reward: float, as_test: bool, baselines_path: str,
         num_samples: int, eval_amount: float, seed: int) -> None:
    algorithm = algorithm.upper()

    ModelCatalog.register_custom_model("bn_model", TorchBatchNormModel)
    ModelCatalog.register_custom_model("skip_model", TorchSkipConnectionModel)
    ModelCatalog.register_custom_model("custom_weights",
                                       TorchCustomWeightsModel)

    ray.init(include_dashboard=False)

    if baselines_path:
        baselines_path = Path(baselines_path)
    baseline_datas: List[pd.DataFrame] = get_baselines(baselines_path)
    all_protos: Set[str] = set(
        itertools.chain.from_iterable(baseline_data.index
                                      for baseline_data in baseline_datas))

    eval_baselines: List[pd.DataFrame] = list()
    if eval_amount:
        baseline_datas, eval_baselines = train_test_split(
            baseline_datas, test_size=eval_amount)

    action_space: str = 'multi_discrete'

    timed_thresholds: Optional[Sequence[Tuple[int, float]]] = None

    config: Dict[str, Any] = dict()
    if algorithm == 'A2C':
        config.update(a2c.A2C_DEFAULT_CONFIG)

        config["rollout_fragment_length"] = 20
        config["use_gae"] = False
        config['vf_loss_coeff'] = .25
        config["lr"] = 0.01
        config['model']['fcnet_hiddens'] = [1024, 512, 256]
        config['min_iter_time_s'] = 20
        timed_thresholds = [(int(1e4), -4.5), (int(4e4), -2), (int(6e4), -1),
                            (int(8e4), 0), (int(1e5), .5), (int(1.2e5), .6),
                            (int(1.5e5), .7), (int(2.5e5), .9)]
    elif algorithm == 'APEX':
        config.update(apex_dqn.APEX_DEFAULT_CONFIG)
        action_space = 'discrete'
    elif algorithm == 'PPO':
        config.update(ppo.DEFAULT_CONFIG)
        config["lr"] = 1e-5
        config['entropy_coeff'] = 0.01
        config['clip_param'] = .3
        config['model']['fcnet_hiddens'] = [1024, 512, 256]
        config["rollout_fragment_length"] = 10
    elif algorithm == 'DQN':
        config.update(dqn.DEFAULT_CONFIG)

        config['hiddens'] = tune.grid_search([[256], [256, 256]])
        config['grad_clip'] = tune.grid_search([.5, 40])
        action_space = 'discrete'
    elif algorithm == 'RAINBOW':
        algorithm = 'DQN'
        # Rainbow runs as DQN with the extensions below, so start from the DQN
        # defaults (otherwise keys such as "lr" and "evaluation_config" would
        # be missing further down).
        config.update(dqn.DEFAULT_CONFIG)
        config['n_step'] = tune.grid_search([2, 5, 10])
        config['noisy'] = True
        config['num_atoms'] = tune.grid_search([2, 5, 10])
        config['v_min'] = -5.0
        config['v_max'] = 1.0
        # config["sigma0"] = tune.grid_search([])

        config['hiddens'] = tune.grid_search([[256], [256, 256]])
        config['grad_clip'] = tune.grid_search([.5, 40])
        action_space = 'discrete'

    baselines_mutators = [
        sizes_mult_mutator(8),
        # rand_by_x_percent_mutator(0.01),
        # switch_2_protocols_mutation,
        # shuffle_protocols_mutation
    ]
    inplace_mutations = [True] + [False] * (len(baselines_mutators) - 1)

    env_params = dict(action_space=action_space,
                      std_coef=None,
                      use_random_io_mask=True,
                      baselines_mutators=baselines_mutators,
                      inplace_mutations=inplace_mutations,
                      all_protos=all_protos)

    config.update({
        "env": RayNonLearningNetworkIoEnv,
        "env_config": env_config(baseline_datas, **env_params),
        "framework": "torch",
        "num_gpus": 0,
        "num_envs_per_worker": 4,
        'num_workers': 2
    })

    if eval_baselines:
        config["evaluation_config"]["env_config"] = env_config(
            eval_baselines, **env_params)
        config["evaluation_num_episodes"] = 100
        config["evaluation_interval"] = 5

    lrs = sorted(c * 10**-i for i, c in itertools.product(range(2, 6), [1, 5]))
    # config["lr"] = tune.grid_search(lrs)
    # config['lr_schedule'] = tune.grid_search(
    #     [double_middle_drop_lr_sched(lr, stop_timesteps) for lr in lrs] + [[(0, lr)] for lr in lrs]
    # )

    config['lr_schedule'] = double_middle_drop_lr_sched(
        config["lr"], stop_timesteps)
    # config['lr_schedule'] = tune.grid_search([None, config['lr_schedule']])

    rand_seeds = [
        565853, 2104103493, 1593411166, 2062870887, 606544299, 1297392272,
        894118955, 759209631, 1951613876, 571931913, 1991302785, 316008064,
        1894618127, 234605346, 1972456995, 1899998980, 1288798130, 1915494248,
        1112988205, 311173854, 1631390566, 910695991, 1991670774, 1725533340,
        1743250890, 1466896085, 322769861, 1922245188, 962566318, 446335427,
        1071978696, 1202354470, 1770330545, 788227602, 104452329, 1431251508,
        1898474473, 2145189166, 1469515549, 1517824005, 1642986198, 1516401273,
        1853918493, 136233403, 1289467510, 1089288981, 1736900138, 2081164597,
        1342690882, 9569014
    ]
    # NOTE: the grid search over `rand_seeds` below overrides the fixed `seed`
    # argument set on the previous line.
    config['seed'] = seed
    config['seed'] = tune.grid_search(rand_seeds)

    # NOTE: this plain `stop` dict is immediately superseded by the custom
    # `Stopper` below; it is kept only as a reference for the stop criteria.
    stop = {
        "training_iteration": stop_iters,
        "timesteps_total": stop_timesteps,
        # "episode_reward_mean": stop_reward,
    }

    stop = Stopper(
        stop_timesteps=stop_timesteps,
        stop_iters=stop_iters,
        # stop_reward=stop_reward,
        max_episodes_without_improvement=15,
        timed_thresholds=timed_thresholds)

    results = tune.run(algorithm,
                       config=config,
                       stop=stop,
                       num_samples=num_samples,
                       reuse_actors=True)

    if as_test:
        check_learning_achieved(results, stop_reward)

    ray.shutdown()
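
The Stopper used in place of the plain stop dict above is project-specific and not shown here. A minimal sketch of a tune.Stopper with a similar constructor (only the budget limits are implemented; the no-improvement and timed-threshold logic is omitted):

from ray import tune

class Stopper(tune.Stopper):
    def __init__(self, stop_timesteps, stop_iters,
                 max_episodes_without_improvement=None, timed_thresholds=None):
        self.stop_timesteps = stop_timesteps
        self.stop_iters = stop_iters

    def __call__(self, trial_id, result):
        # Stop a trial once either the timestep or the iteration budget is spent.
        return (result["timesteps_total"] >= self.stop_timesteps
                or result["training_iteration"] >= self.stop_iters)

    def stop_all(self):
        # Never stop the whole experiment early in this sketch.
        return False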