        buffer_size=config["buffer_size"],
        train_batch_size=config["train_batch_size"],
        sample_batch_size=config["sample_batch_size"],
        **extra_config)
    workers.add_workers(config["num_workers"])
    opt._set_workers(workers.remote_workers())
    return opt


def update_target_based_on_num_steps_trained(trainer, fetches):
    # Ape-X updates based on num steps trained, not sampled
    if (trainer.optimizer.num_steps_trained -
            trainer.state["last_target_update_ts"] >
            trainer.config["target_network_update_freq"]):
        trainer.workers.local_worker().foreach_trainable_policy(
            lambda p, _: p.update_target())
        trainer.state["last_target_update_ts"] = (
            trainer.optimizer.num_steps_trained)
        trainer.state["num_target_updates"] += 1


APEX_TRAINER_PROPERTIES = {
    "make_workers": defer_make_workers,
    "make_policy_optimizer": make_async_optimizer,
    "after_optimizer_step": update_target_based_on_num_steps_trained,
}

ApexTrainer = DQNTrainer.with_updates(
    name="APEX",
    default_config=APEX_DEFAULT_CONFIG,
    **APEX_TRAINER_PROPERTIES)
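A minimal usage sketch (not part of the original source) of how the ApexTrainer assembled above might be instantiated and stepped; the environment name, worker count, and iteration count are illustrative assumptions, not recommended settings.

import ray
from ray.tune.logger import pretty_print

ray.init()
# Hypothetical example: run the Ape-X variant on a placeholder Gym env.
apex = ApexTrainer(
    env="CartPole-v0",  # placeholder environment, purely illustrative
    config={
        "num_workers": 4,  # Ape-X needs remote workers for sampling
        "num_gpus": 0,     # override the GPU default for a CPU-only sketch
    })
for _ in range(3):
    print(pretty_print(apex.train()))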
    merged_op = Concurrently(
        [store_op, replay_op, update_op], mode="async", output_indexes=[2])

    # Add in extra replay and learner metrics to the training result.
    def add_apex_metrics(result: dict) -> dict:
        replay_stats = ray.get(replay_actors[0].stats.remote(
            config["optimizer"].get("debug")))
        exploration_infos = workers.foreach_trainable_policy(
            lambda p, _: p.get_exploration_info())
        result["info"].update({
            "exploration_infos": exploration_infos,
            "learner_queue": learner_thread.learner_queue_size.stats(),
            "learner": copy.deepcopy(learner_thread.stats),
            "replay_shard_0": replay_stats,
        })
        return result

    # Only report metrics from the workers with the lowest 1/3 of epsilons.
    selected_workers = workers.remote_workers()[
        -len(workers.remote_workers()) // 3:]

    return StandardMetricsReporting(
        merged_op, workers, config,
        selected_workers=selected_workers).for_each(add_apex_metrics)


ApexTrainer = DQNTrainer.with_updates(
    name="APEX",
    default_config=APEX_DEFAULT_CONFIG,
    execution_plan=apex_execution_plan)
if config["simple_optimizer"]: train_step_op = TrainOneStep(workers) else: train_step_op = MultiGPUTrainOneStep( workers=workers, sgd_minibatch_size=config["train_batch_size"], num_sgd_iter=1, num_gpus=config["num_gpus"], shuffle_sequences=True, _fake_gpus=config["_fake_gpus"], framework=config.get("framework")) # (2) Read and train on experiences from the replay buffer. replay_op = Replay(local_buffer=local_replay_buffer) \ .for_each(train_step_op) \ .for_each(UpdateTargetNetwork( workers, config["target_network_update_freq"])) # Alternate deterministically between (1) and (2). train_op = Concurrently([store_op, replay_op], mode="round_robin", output_indexes=[1]) return StandardMetricsReporting(train_op, workers, config) SimpleQTrainer = DQNTrainer.with_updates(default_policy=SimpleQTFPolicy, get_policy_class=get_policy_class, execution_plan=execution_plan, default_config=DEFAULT_CONFIG)
"multiagent": { "policies": policies, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["ppo_policy"], }, "explore": False, # disable filters, otherwise we would need to synchronize those # as well to the DQN agent "observation_filter": "NoFilter", }) dqn_trainer = DQNTrainer(env="multi_cartpole", config={ "multiagent": { "policies": policies, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["dqn_policy"], }, "gamma": 0.95, "n_step": 3, }) # You should see both the printed X and Y approach 200 as this trains: # info: # policy_reward_mean: # dqn_policy: X # ppo_policy: Y for i in range(args.num_iters): print("== Iteration", i, "==") # improve the DQN policy print("-- DQN --")
def main(argv):
    ModelCatalog.register_custom_model("my_model", MyModelClass)

    model = {
        # Custom model options
        "custom_model": "my_model",
        "custom_preprocessor": None,
        # Extra options to pass to the custom classes
        "custom_options": {},
        # Built-in options
        # Number of hidden layers for fully connected net
        "fcnet_hiddens": [256, 256, 256, 256],
    }

    num_workers = 2
    # Read out the command line arguments.
    try:
        opts, args = getopt.getopt(argv, "hn:", ["number-worker="])
    except getopt.GetoptError:
        print('ray_server.py -n <number-worker>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('ray_server.py -n <number-worker>')
            print('-n --number-worker - number of workers to start')
            sys.exit()
        elif opt in ("-n", "--number-worker"):
            num_workers = int(arg)

    ray.init()
    print("[RAY] Initialized")
    register_env("srv", lambda _: CartpoleServing())

    if ALGORITHM == "APEX":
        dqn = ApexTrainer(
            env="srv",
            config={
                # Model
                "model": model,
                "gamma": 0.99,
                "noisy": False,
                "num_gpus": 1,
                # Evaluation: everything default, see dqn.py
                # Exploration
                "target_network_update_freq": 500000,
                # Rest: everything default, see dqn.py
                # Replay buffer
                # Size of the replay buffer. Note that if async_updates is
                # set, then each worker will have a replay buffer of this
                # size (default 50000).
                "buffer_size": 2000000,
                # If True, a prioritized replay buffer will be used.
                "prioritized_replay": True,
                # Many parameters here are left at their defaults (see dqn.py).
                # Optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.0001,
                # Size of the rollout batch. Default sample batch size
                # (unroll length). Batches of this size are collected from
                # workers until train_batch_size is met. When using multiple
                # envs per worker, this is multiplied by num_envs_per_worker.
                "sample_batch_size": 4,
                # Training batch size, if applicable. Should be >=
                # sample_batch_size. Sample batches will be concatenated
                # together to this size for training.
                "train_batch_size": 64,
                # How many steps of the model to sample before learning starts
                "learning_starts": 50000,
                # Parallelism
                "num_workers": num_workers,
                # Distribute epsilon over the workers (default for Ape-X).
                "per_worker_exploration": True,
                # Determine per worker which experience should be prioritized
                # before handing it to the shared experience memory.
                "worker_side_prioritization": True,
                # "schedule_max_timesteps": 100000,  # what does this do?
                # "timesteps_per_iteration": 25000,  # what does this do?
                # "min_iter_time_s": 30,  # what does this do?
            })
    else:
        dqn = DQNTrainer(
            env="srv",
            config={
                # Model
                # Use multiple threads for the workers; set to False for
                # debugging.
                # "sample_async": True,
                # "grad_clip": 0.5,
                "model": model,
                "gamma": 0.99,
                "noisy": False,
                "num_gpus": 1,
                # Whether to use dueling DQN
                "dueling": False,
                # Whether to use double DQN
                "double_q": False,
                # Evaluation: everything default, see dqn.py
                # Exploration
                "target_network_update_freq": 500000,
                # Rest: everything default, see dqn.py
                # Replay buffer
                # Size of the replay buffer. Note that if async_updates is
                # set, then each worker will have a replay buffer of this
                # size (default 50000).
                "buffer_size": 2000000,
                # If True, a prioritized replay buffer will be used.
                "prioritized_replay": False,
                # Many parameters here are left at their defaults (see dqn.py).
                # Optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.0001,
                # Update the replay buffer with this many samples at once.
                # Note that this setting applies per-worker if num_workers > 1.
                # "sample_batch_size": 1024,
                # How many steps of the model to sample before learning starts
                "learning_starts": 50000,
                # Size of a batch sampled from the replay buffer for training.
                # Note that if async_updates is set, then each worker returns
                # gradients for a batch of this size (minibatch size). Should
                # be >= sample_batch_size. Sample batches will be concatenated
                # together to this size for training.
                "train_batch_size": 2048,
                # Parallelism
                # Number of workers for collecting samples with. This only
                # makes sense to increase if your environment is particularly
                # slow to sample, or if you're using the Async or Ape-X
                # optimizers.
                "num_workers": num_workers,
                # Distribute epsilon over the workers.
                "per_worker_exploration": True,
                # Compute worker-side prioritization (False, because this is
                # not implemented for plain DQN).
                "worker_side_prioritization": False,
            })

    # Write the policy graph to TensorBoard (for debugging purposes).
    policy_graph = dqn.local_evaluator.policy_map["default_policy"].sess.graph
    writer = tf.summary.FileWriter(dqn._result_logger.logdir, policy_graph)
    writer.close()

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop.
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
ray.init()

# Register our custom SimpleServing environment as a known environment
# with name "srv".
register_env("srv", lambda config: SimpleServing(config))

if args.run == "DQN":
    agent = DQNTrainer(
        env="srv",
        config={
            # Use a single process to avoid needing a load balancer.
            "num_workers": 0,
            # Configure the agent to run short iterations for debugging.
            # "exploration_fraction": 0.01,
            "learning_starts": 100,
            "timesteps_per_iteration": 200,
            "env_config": {
                # Use the connector server to generate experiences.
                "input": (
                    lambda ioctx: PolicyServerInput(
                        ioctx, SERVER_ADDRESS, SERVER_PORT)
                ),
                "observation_size": args.observation_size,
                "action_size": args.action_size,
            },
        })
elif args.run == "PG":
    agent = PGTrainer(
        env="srv",
        config={
            "num_workers": 0,
            "env_config": {
                # Use the connector server to generate experiences.
config["num_workers"] = 18 config["num_gpus"] = 2 config["n_step"] = 3 config["buffer_size"] = 2000000 config["n_step"] = 3 config["learning_starts"] = 50000 config["train_batch_size"] = 512 config["timesteps_per_iteration"] = 25000 config["target_network_update_freq"] = 500000 config["exploration_config"] = {"type": "PerWorkerEpsilonGreedy"} config["worker_side_prioritization"] = True # config["min_iter_time_s"] = 30 # config["training_intensity"] = None # config["log_level"] = 'DEBUG' config["env_config"] = env_config trainer = DQNTrainer(config=config, env=SSA_Tasker_Env) # Can optionally call trainer.restore(path) to load a checkpoint. checkpoints = [] result = {'timesteps_total': 0} i = 0 while result['timesteps_total'] < 1e7: # Perform one iteration of training the policy with PPO result = trainer.train() print(pretty_print(result)) if result['training_iteration'] % 4 == 0: checkpoint = trainer.save() print("checkpoint saved at", checkpoint) checkpoints.append(copy(checkpoint))
            learner_thread.learner_queue_size.stats(),
            "learner": copy.deepcopy(learner_thread.stats),
            "replay_shard_0": replay_stats,
        })
        return result

    # Only report metrics from the workers with the lowest 1/3 of epsilons.
    selected_workers = workers.remote_workers()[
        -len(workers.remote_workers()) // 3:]

    return StandardMetricsReporting(
        merged_op, workers, config,
        selected_workers=selected_workers).for_each(add_apex_metrics)


def apex_validate_config(config):
    if config["num_gpus"] > 1:
        raise ValueError("`num_gpus` > 1 not yet supported for APEX-DQN!")
    validate_config(config)


ApexTrainer = DQNTrainer.with_updates(
    name="APEX",
    default_config=APEX_DEFAULT_CONFIG,
    validate_config=apex_validate_config,
    execution_plan=apex_execution_plan,
    mixins=[OverrideDefaultResourceRequest],
)
        # Path(__file__).parent / "../dataset/intersection_4lane_sv_up"
        # Path(__file__).parent / "../dataset_public/mixed_loop/its_merge_a"
    # ).resolve(),
    # (
        # Path(__file__).parent / "../dataset/intersection_4lane_sv_right"
        # Path(__file__).parent / "../dataset_public/mixed_loop/roundabout_its_a"
    # ).resolve(),
    (
        # Path(__file__).parent / "../dataset_public/mixed_loop/roundabout_merge_a"
        Path(__file__).parent / "../dataset/simple"
    ).resolve()]

print(f"training on {scenario_paths}")

from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.dqn.dqn import (DEFAULT_CONFIG, DQNTrainer,
                                       validate_config, execution_plan,
                                       get_policy_class)

config = DEFAULT_CONFIG.copy()
# config["seed_global"] = 0

DQN = DQNTrainer.with_updates(
    name="DQN_TORCH",
    default_policy=DQNTorchPolicy,
    default_config=DEFAULT_CONFIG,
    get_policy_class=None)


def parse_args():
    parser = argparse.ArgumentParser("train on multi scenarios")

    # env setting
    parser.add_argument("--scenario", type=str, default=None,
                        help="Scenario name")
    parser.add_argument("--exper", type=str, default="multi_scenarios")
    parser.add_argument(
        "--headless", default=False, action="store_true",
        help="Turn on headless mode"
    )
    parser.add_argument("--num_workers", type=int, default=1,
                        help="rllib num workers")
    parser.add_argument(
        "--horizon", type=int, default=1000, help="horizon for an episode"
def policy_mapping_fn(agent_id):
    # if agent_id % 2 == 0:
    #     return "ppo_policy"
    # else:
    #     return "dqn_policy"
    return agent_id


dqn_trainer = DQNTrainer(
    env="cityflow_multi",
    config={
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": [id_ for id_ in intersection_id]
        },
        "gamma": 0.95,
        "n_step": 3,
        "num_workers": 1,
        "num_cpus_per_worker": 20,
        "env_config": config
    })

for i in range(args.epoch):
    print("== Iteration", i, "==")

    # Improve the DQN policy.
    print("-- DQN --")
    print(pretty_print(dqn_trainer.train()))
        max_depart_delay=0))

trainer = DQNTrainer(
    env="2TLS",
    config={
        "multiagent": {
            "policy_graphs": {
                '3210041371': (DQNTFPolicy,
                               spaces.Box(low=np.zeros(16),
                                          high=np.array(['inf'] * 16)),
                               spaces.Discrete(2), {}),
                '452397025': (DQNTFPolicy,
                              spaces.Box(low=np.zeros(14),
                                         high=np.array(['inf'] * 14)),
                              spaces.Discrete(2), {}),
                '4708662059': (DQNTFPolicy,
                               spaces.Box(low=np.zeros(19),
                                          high=np.array(['inf'] * 19)),
                               spaces.Discrete(2), {}),
                '5870232715': (DQNTFPolicy,
                               spaces.Box(low=np.zeros(10),
                                          high=np.array(['inf'] * 10)),
                               spaces.Discrete(2), {})
            },
            # Traffic lights are always controlled by this policy.
            "policy_mapping_fn": policy_mapping
        },
        "lr": 0.0001,
    })

while True:
def get_trainer_from_params(params):
    return DQNTrainer(env="melee", config=params['rllib_params'])
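A usage sketch for the helper above. The `params` dict shown is hypothetical; it only assumes that `params['rllib_params']` holds an RLlib config dict and that the "melee" environment has been registered elsewhere.

# Hypothetical example parameters; real values come from the caller's
# experiment setup.
params = {"rllib_params": {"num_workers": 0, "gamma": 0.99}}
trainer = get_trainer_from_params(params)
result = trainer.train()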
ray.init(num_gpus=1, log_to_driver=False, local_mode=True,
         ignore_reinit_error=True)
ModelCatalog.register_custom_model("keras_q_model", DQNModel)

qTrainer = DQNTrainer(
    env=KGRLEnv,
    config={
        "model": {
            "custom_model": "keras_q_model"
        },
        "seed": seed,
        # Config to pass to the env class.
        "env_config": {
            "training": True,
            "idx_to_test": None,
            "train_data": train_data,
            "test_data": test_data,
            "pred_train": pred_train,
            "pred_test": pred_test,
            "do_bert": do_bert
        },
        "buffer_size": 100,
        "lr_schedule": [[0, 0.05], [20, 0.01], [30, 0.005], [50, 0.001]],
        "train_batch_size": 100
    })

prev_time = time.time()
for i in range(total_iteration):
    print("iteration {};".format(i),
          "%d sec/iteration;" % (time.time() - prev_time),
          "%d min remaining" % ((total_iteration - i) *
                                (time.time() - prev_time) / 60))
        route_file='nets/Research/case03/test.rou.xml',
        out_csv_path='outputs/grad/',
        out_csv_name='nonrl',
        use_gui=True,
        num_seconds=22000,
        time_to_load_vehicles=21600,
        max_depart_delay=0))

trainer = DQNTrainer(
    env="2TLS",
    config={
        "multiagent": {
            "policy_graphs": {
                'left': (DQNTFPolicy,
                         spaces.Box(low=np.zeros(21), high=np.ones(21)),
                         spaces.Discrete(2), {}),
                'right': (DQNTFPolicy,
                          spaces.Box(low=np.zeros(21), high=np.ones(21)),
                          spaces.Discrete(2), {})
            },
            # Traffic lights are always controlled by this policy.
            "policy_mapping_fn": policy_mapping
        },
        "lr": 0.0001,
    })

while True:
    result = trainer.train()
"2TLS", lambda _: SumoEnvironment( net_file= '/home/sonic/Desktop/sumo-rl-research-offset/sumo-rl-research/experiments/nets/Research/case04/intersection.net.xml', route_file= '/home/sonic/Desktop/sumo-rl-research-offset/sumo-rl-research/experiments/nets/Research/case04/intersection.rou.xml', out_csv_path='outputs/case04/', out_csv_name='DQN_3', use_gui=True, num_seconds=15300510, time_to_load_vehicles=510, max_depart_delay=0)) trainer = DQNTrainer( env="2TLS", config={ "multiagent": { "policy_graphs": { 'offset_agent': (DQNTFPolicy, spaces.Box(low=np.zeros(15), high=np.array(['inf'] * 15)), spaces.MultiDiscrete([102, 102]), {}) }, "policy_mapping_fn": policy_mapping # Traffic lights are always controlled by this policy }, "lr": 0.0001, }) while True: result = trainer.train() # /home/sonic/Desktop/sumo-rl-research-offset/sumo-rl-research/experiments/