        buffer_size=config["buffer_size"],
        train_batch_size=config["train_batch_size"],
        sample_batch_size=config["sample_batch_size"],
        **extra_config)
    workers.add_workers(config["num_workers"])
    opt._set_workers(workers.remote_workers())
    return opt


def update_target_based_on_num_steps_trained(trainer, fetches):
    # Ape-X updates based on num steps trained, not sampled
    if (trainer.optimizer.num_steps_trained -
            trainer.state["last_target_update_ts"] >
            trainer.config["target_network_update_freq"]):
        trainer.workers.local_worker().foreach_trainable_policy(
            lambda p, _: p.update_target())
        trainer.state["last_target_update_ts"] = (
            trainer.optimizer.num_steps_trained)
        trainer.state["num_target_updates"] += 1


APEX_TRAINER_PROPERTIES = {
    "make_workers": defer_make_workers,
    "make_policy_optimizer": make_async_optimizer,
    "after_optimizer_step": update_target_based_on_num_steps_trained,
}

ApexTrainer = DQNTrainer.with_updates(
    name="APEX",
    default_config=APEX_DEFAULT_CONFIG,
    **APEX_TRAINER_PROPERTIES)
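A minimal usage sketch (not part of the original source) of how the ApexTrainer assembled above might be instantiated and stepped; the environment name, worker count, and iteration count are illustrative assumptions, not recommended settings.

import ray
from ray.tune.logger import pretty_print

ray.init()
# Hypothetical example: run the Ape-X variant on a placeholder Gym env.
apex = ApexTrainer(
    env="CartPole-v0",  # placeholder environment, purely illustrative
    config={
        "num_workers": 4,  # Ape-X needs remote workers for sampling
        "num_gpus": 0,     # override the GPU default for a CPU-only sketch
    })
for _ in range(3):
    print(pretty_print(apex.train()))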
    merged_op = Concurrently(
        [store_op, replay_op, update_op], mode="async", output_indexes=[2])

    # Add in extra replay and learner metrics to the training result.
    def add_apex_metrics(result: dict) -> dict:
        replay_stats = ray.get(replay_actors[0].stats.remote(
            config["optimizer"].get("debug")))
        exploration_infos = workers.foreach_trainable_policy(
            lambda p, _: p.get_exploration_info())
        result["info"].update({
            "exploration_infos": exploration_infos,
            "learner_queue": learner_thread.learner_queue_size.stats(),
            "learner": copy.deepcopy(learner_thread.stats),
            "replay_shard_0": replay_stats,
        })
        return result

    # Only report metrics from the workers with the lowest 1/3 of epsilons.
    selected_workers = workers.remote_workers()[
        -len(workers.remote_workers()) // 3:]

    return StandardMetricsReporting(
        merged_op, workers, config,
        selected_workers=selected_workers).for_each(add_apex_metrics)


ApexTrainer = DQNTrainer.with_updates(
    name="APEX",
    default_config=APEX_DEFAULT_CONFIG,
    execution_plan=apex_execution_plan)
if config["simple_optimizer"]: train_step_op = TrainOneStep(workers) else: train_step_op = MultiGPUTrainOneStep( workers=workers, sgd_minibatch_size=config["train_batch_size"], num_sgd_iter=1, num_gpus=config["num_gpus"], shuffle_sequences=True, _fake_gpus=config["_fake_gpus"], framework=config.get("framework")) # (2) Read and train on experiences from the replay buffer. replay_op = Replay(local_buffer=local_replay_buffer) \ .for_each(train_step_op) \ .for_each(UpdateTargetNetwork( workers, config["target_network_update_freq"])) # Alternate deterministically between (1) and (2). train_op = Concurrently([store_op, replay_op], mode="round_robin", output_indexes=[1]) return StandardMetricsReporting(train_op, workers, config) SimpleQTrainer = DQNTrainer.with_updates(default_policy=SimpleQTFPolicy, get_policy_class=get_policy_class, execution_plan=execution_plan, default_config=DEFAULT_CONFIG)
"multiagent": { "policies": policies, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["ppo_policy"], }, "explore": False, # disable filters, otherwise we would need to synchronize those # as well to the DQN agent "observation_filter": "NoFilter", }) dqn_trainer = DQNTrainer(env="multi_cartpole", config={ "multiagent": { "policies": policies, "policy_mapping_fn": policy_mapping_fn, "policies_to_train": ["dqn_policy"], }, "gamma": 0.95, "n_step": 3, }) # You should see both the printed X and Y approach 200 as this trains: # info: # policy_reward_mean: # dqn_policy: X # ppo_policy: Y for i in range(args.num_iters): print("== Iteration", i, "==") # improve the DQN policy print("-- DQN --")
def main(argv):
    ModelCatalog.register_custom_model("my_model", MyModelClass)

    model = {
        # Custom model options
        "custom_model": "my_model",
        "custom_preprocessor": None,
        # Extra options to pass to the custom classes
        "custom_options": {},
        # Built-in options
        # Number of hidden layers for fully connected net
        "fcnet_hiddens": [256, 256, 256, 256],
    }

    num_workers = 2
    # Read out the command line arguments.
    try:
        opts, args = getopt.getopt(argv, "hn:", ["number-worker="])
    except getopt.GetoptError:
        print('ray_server.py -n <number-worker>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('ray_server.py -n <number-worker>')
            print('-n --number-worker - number of workers to start')
            sys.exit()
        elif opt in ("-n", "--number-worker"):
            num_workers = int(arg)

    ray.init()
    print("[RAY] Initialized")
    register_env("srv", lambda _: CartpoleServing())

    if ALGORITHM == "APEX":
        dqn = ApexTrainer(
            env="srv",
            config={
                # Model
                "model": model,
                "gamma": 0.99,
                "noisy": False,
                "num_gpus": 1,
                # Evaluation: everything default, see dqn.py
                # Exploration
                "target_network_update_freq": 500000,
                # Rest: everything default, see dqn.py
                # Replay buffer
                # Size of the replay buffer. Note that if async_updates is
                # set, then each worker will have a replay buffer of this
                # size (default 50000).
                "buffer_size": 2000000,
                # If True, a prioritized replay buffer will be used.
                "prioritized_replay": True,
                # Many parameters here are left at their defaults (see dqn.py).
                # Optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.0001,
                # Size of the rollout batch. Default sample batch size
                # (unroll length). Batches of this size are collected from
                # workers until train_batch_size is met. When using multiple
                # envs per worker, this is multiplied by num_envs_per_worker.
                "sample_batch_size": 4,
                # Training batch size, if applicable. Should be >=
                # sample_batch_size. Sample batches will be concatenated
                # together to this size for training.
                "train_batch_size": 64,
                # How many steps of the model to sample before learning starts
                "learning_starts": 50000,
                # Parallelism
                "num_workers": num_workers,
                # Distribute epsilon over the workers (default for Ape-X).
                "per_worker_exploration": True,
                # Determine per worker which experience should be prioritized
                # before handing it to the shared experience memory.
                "worker_side_prioritization": True,
                # "schedule_max_timesteps": 100000,  # what does this do?
                # "timesteps_per_iteration": 25000,  # what does this do?
                # "min_iter_time_s": 30,  # what does this do?
            })
    else:
        dqn = DQNTrainer(
            env="srv",
            config={
                # Model
                # Use multiple threads for the workers; set to False for
                # debugging.
                # "sample_async": True,
                # "grad_clip": 0.5,
                "model": model,
                "gamma": 0.99,
                "noisy": False,
                "num_gpus": 1,
                # Whether to use dueling DQN
                "dueling": False,
                # Whether to use double DQN
                "double_q": False,
                # Evaluation: everything default, see dqn.py
                # Exploration
                "target_network_update_freq": 500000,
                # Rest: everything default, see dqn.py
                # Replay buffer
                # Size of the replay buffer. Note that if async_updates is
                # set, then each worker will have a replay buffer of this
                # size (default 50000).
                "buffer_size": 2000000,
                # If True, a prioritized replay buffer will be used.
                "prioritized_replay": False,
                # Many parameters here are left at their defaults (see dqn.py).
                # Optimization
                # Learning rate - defaults to 5e-4
                "lr": 0.0001,
                # Update the replay buffer with this many samples at once.
                # Note that this setting applies per-worker if num_workers > 1.
                # "sample_batch_size": 1024,
                # How many steps of the model to sample before learning starts
                "learning_starts": 50000,
                # Size of a batch sampled from the replay buffer for training.
                # Note that if async_updates is set, then each worker returns
                # gradients for a batch of this size (minibatch size). Should
                # be >= sample_batch_size. Sample batches will be concatenated
                # together to this size for training.
                "train_batch_size": 2048,
                # Parallelism
                # Number of workers for collecting samples with. This only
                # makes sense to increase if your environment is particularly
                # slow to sample, or if you're using the Async or Ape-X
                # optimizers.
                "num_workers": num_workers,
                # Distribute epsilon over the workers.
                "per_worker_exploration": True,
                # Compute worker-side prioritization (False, because this is
                # not implemented for plain DQN).
                "worker_side_prioritization": False,
            })

    # Write the policy graph to TensorBoard (for debugging purposes).
    policy_graph = dqn.local_evaluator.policy_map["default_policy"].sess.graph
    writer = tf.summary.FileWriter(dqn._result_logger.logdir, policy_graph)
    writer.close()

    # Attempt to restore from checkpoint if possible.
    if os.path.exists(CHECKPOINT_FILE):
        checkpoint_path = open(CHECKPOINT_FILE).read()
        print("Restoring from checkpoint path", checkpoint_path)
        dqn.restore(checkpoint_path)

    # Serving and training loop.
    while True:
        print(pretty_print(dqn.train()))
        checkpoint_path = dqn.save()
        print("Last checkpoint", checkpoint_path)
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(checkpoint_path)
ray.init()

# Register our custom SimpleServing environment as a known environment
# with name "srv".
register_env("srv", lambda config: SimpleServing(config))

if args.run == "DQN":
    agent = DQNTrainer(
        env="srv",
        config={
            # Use a single process to avoid needing a load balancer.
            "num_workers": 0,
            # Configure the agent to run short iterations for debugging.
            # "exploration_fraction": 0.01,
            "learning_starts": 100,
            "timesteps_per_iteration": 200,
            "env_config": {
                # Use the connector server to generate experiences.
                "input": (
                    lambda ioctx: PolicyServerInput(
                        ioctx, SERVER_ADDRESS, SERVER_PORT)
                ),
                "observation_size": args.observation_size,
                "action_size": args.action_size,
            },
        })
elif args.run == "PG":
    agent = PGTrainer(
        env="srv",
        config={
            "num_workers": 0,
            "env_config": {
                # Use the connector server to generate experiences.
config["num_workers"] = 18 config["num_gpus"] = 2 config["n_step"] = 3 config["buffer_size"] = 2000000 config["n_step"] = 3 config["learning_starts"] = 50000 config["train_batch_size"] = 512 config["timesteps_per_iteration"] = 25000 config["target_network_update_freq"] = 500000 config["exploration_config"] = {"type": "PerWorkerEpsilonGreedy"} config["worker_side_prioritization"] = True # config["min_iter_time_s"] = 30 # config["training_intensity"] = None # config["log_level"] = 'DEBUG' config["env_config"] = env_config trainer = DQNTrainer(config=config, env=SSA_Tasker_Env) # Can optionally call trainer.restore(path) to load a checkpoint. checkpoints = [] result = {'timesteps_total': 0} i = 0 while result['timesteps_total'] < 1e7: # Perform one iteration of training the policy with PPO result = trainer.train() print(pretty_print(result)) if result['training_iteration'] % 4 == 0: checkpoint = trainer.save() print("checkpoint saved at", checkpoint) checkpoints.append(copy(checkpoint))
            learner_thread.learner_queue_size.stats(),
            "learner": copy.deepcopy(learner_thread.stats),
            "replay_shard_0": replay_stats,
        })
        return result

    # Only report metrics from the workers with the lowest 1/3 of epsilons.
    selected_workers = workers.remote_workers()[
        -len(workers.remote_workers()) // 3:]

    return StandardMetricsReporting(
        merged_op, workers, config,
        selected_workers=selected_workers).for_each(add_apex_metrics)


def apex_validate_config(config):
    if config["num_gpus"] > 1:
        raise ValueError("`num_gpus` > 1 not yet supported for APEX-DQN!")
    validate_config(config)


ApexTrainer = DQNTrainer.with_updates(
    name="APEX",
    default_config=APEX_DEFAULT_CONFIG,
    validate_config=apex_validate_config,
    execution_plan=apex_execution_plan,
    mixins=[OverrideDefaultResourceRequest],
)
        # Path(__file__).parent / "../dataset/intersection_4lane_sv_up"
        # Path(__file__).parent / "../dataset_public/mixed_loop/its_merge_a"
    # ).resolve(),
    # (
        # Path(__file__).parent / "../dataset/intersection_4lane_sv_right"
        # Path(__file__).parent / "../dataset_public/mixed_loop/roundabout_its_a"
    # ).resolve(),
    (
        # Path(__file__).parent / "../dataset_public/mixed_loop/roundabout_merge_a"
        Path(__file__).parent / "../dataset/simple"
    ).resolve()]

print(f"training on {scenario_paths}")

from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.dqn.dqn import (DEFAULT_CONFIG, DQNTrainer,
                                       validate_config, execution_plan,
                                       get_policy_class)

config = DEFAULT_CONFIG.copy()
# config["seed_global"] = 0

DQN = DQNTrainer.with_updates(
    name="DQN_TORCH",
    default_policy=DQNTorchPolicy,
    default_config=DEFAULT_CONFIG,
    get_policy_class=None)


def parse_args():
    parser = argparse.ArgumentParser("train on multi scenarios")

    # env setting
    parser.add_argument("--scenario", type=str, default=None,
                        help="Scenario name")
    parser.add_argument("--exper", type=str, default="multi_scenarios")
    parser.add_argument(
        "--headless", default=False, action="store_true",
        help="Turn on headless mode"
    )
    parser.add_argument("--num_workers", type=int, default=1,
                        help="rllib num workers")
    parser.add_argument(
        "--horizon", type=int, default=1000, help="horizon for an episode"
def policy_mapping_fn(agent_id):
    # if agent_id % 2 == 0:
    #     return "ppo_policy"
    # else:
    #     return "dqn_policy"
    return agent_id


dqn_trainer = DQNTrainer(
    env="cityflow_multi",
    config={
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
            "policies_to_train": [id_ for id_ in intersection_id]
        },
        "gamma": 0.95,
        "n_step": 3,
        "num_workers": 1,
        "num_cpus_per_worker": 20,
        "env_config": config
    })

for i in range(args.epoch):
    print("== Iteration", i, "==")

    # Improve the DQN policy.
    print("-- DQN --")
    print(pretty_print(dqn_trainer.train()))
        max_depart_delay=0))

trainer = DQNTrainer(
    env="2TLS",
    config={
        "multiagent": {
            "policy_graphs": {
                '3210041371': (DQNTFPolicy,
                               spaces.Box(low=np.zeros(16),
                                          high=np.array(['inf'] * 16)),
                               spaces.Discrete(2), {}),
                '452397025': (DQNTFPolicy,
                              spaces.Box(low=np.zeros(14),
                                         high=np.array(['inf'] * 14)),
                              spaces.Discrete(2), {}),
                '4708662059': (DQNTFPolicy,
                               spaces.Box(low=np.zeros(19),
                                          high=np.array(['inf'] * 19)),
                               spaces.Discrete(2), {}),
                '5870232715': (DQNTFPolicy,
                               spaces.Box(low=np.zeros(10),
                                          high=np.array(['inf'] * 10)),
                               spaces.Discrete(2), {})
            },
            # Traffic lights are always controlled by this policy.
            "policy_mapping_fn": policy_mapping
        },
        "lr": 0.0001,
    })

while True:
def get_trainer_from_params(params):
    return DQNTrainer(env="melee", config=params['rllib_params'])
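A usage sketch for the helper above. The `params` dict shown is hypothetical; it only assumes that `params['rllib_params']` holds an RLlib config dict and that the "melee" environment has been registered elsewhere.

# Hypothetical example parameters; real values come from the caller's
# experiment setup.
params = {"rllib_params": {"num_workers": 0, "gamma": 0.99}}
trainer = get_trainer_from_params(params)
result = trainer.train()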
ray.init(num_gpus=1, log_to_driver=False, local_mode=True,
         ignore_reinit_error=True)
ModelCatalog.register_custom_model("keras_q_model", DQNModel)

qTrainer = DQNTrainer(
    env=KGRLEnv,
    config={
        "model": {
            "custom_model": "keras_q_model"
        },
        "seed": seed,
        # Config to pass to the env class.
        "env_config": {
            "training": True,
            "idx_to_test": None,
            "train_data": train_data,
            "test_data": test_data,
            "pred_train": pred_train,
            "pred_test": pred_test,
            "do_bert": do_bert
        },
        "buffer_size": 100,
        "lr_schedule": [[0, 0.05], [20, 0.01], [30, 0.005], [50, 0.001]],
        "train_batch_size": 100
    })

prev_time = time.time()
for i in range(total_iteration):
    print("iteration {};".format(i),
          "%d sec/iteration;" % (time.time() - prev_time),
          "%d min remaining" % ((total_iteration - i) *
                                (time.time() - prev_time) / 60))
        route_file='nets/Research/case03/test.rou.xml',
        out_csv_path='outputs/grad/',
        out_csv_name='nonrl',
        use_gui=True,
        num_seconds=22000,
        time_to_load_vehicles=21600,
        max_depart_delay=0))

trainer = DQNTrainer(
    env="2TLS",
    config={
        "multiagent": {
            "policy_graphs": {
                'left': (DQNTFPolicy,
                         spaces.Box(low=np.zeros(21), high=np.ones(21)),
                         spaces.Discrete(2), {}),
                'right': (DQNTFPolicy,
                          spaces.Box(low=np.zeros(21), high=np.ones(21)),
                          spaces.Discrete(2), {})
            },
            # Traffic lights are always controlled by this policy.
            "policy_mapping_fn": policy_mapping
        },
        "lr": 0.0001,
    })

while True:
    result = trainer.train()
"2TLS", lambda _: SumoEnvironment( net_file= '/home/sonic/Desktop/sumo-rl-research-offset/sumo-rl-research/experiments/nets/Research/case04/intersection.net.xml', route_file= '/home/sonic/Desktop/sumo-rl-research-offset/sumo-rl-research/experiments/nets/Research/case04/intersection.rou.xml', out_csv_path='outputs/case04/', out_csv_name='DQN_3', use_gui=True, num_seconds=15300510, time_to_load_vehicles=510, max_depart_delay=0)) trainer = DQNTrainer( env="2TLS", config={ "multiagent": { "policy_graphs": { 'offset_agent': (DQNTFPolicy, spaces.Box(low=np.zeros(15), high=np.array(['inf'] * 15)), spaces.MultiDiscrete([102, 102]), {}) }, "policy_mapping_fn": policy_mapping # Traffic lights are always controlled by this policy }, "lr": 0.0001, }) while True: result = trainer.train() # /home/sonic/Desktop/sumo-rl-research-offset/sumo-rl-research/experiments/