# Deep Deterministic Policy Gradient (DDPG)

Selecting DDPG as the training algorithm and building a multi-agent `DDPGTrainer`:

```python
if train_algo == "DDPG":  # condition reconstructed; the snippet begins inside this branch
    print("Training algorithm: Deep Deterministic Policy Gradient (DDPG)")
    trainer = DDPGTrainer(
        env=env_title,
        config={
            "num_workers": num_workers,
            "num_cpus_per_worker": num_cpus_per_worker,
            "num_gpus": num_gpus,
            "num_gpus_per_worker": num_gpus_per_worker,
            "model": nw_model,
            "lr": lr,
            "gamma": gamma,
            "actor_hiddens": [hidden_layer_size, hidden_layer_size],
            "critic_hiddens": [hidden_layer_size, hidden_layer_size],
            "multiagent": {
                "policy_graphs": policy_graphs,
                "policy_mapping_fn": policy_mapping_fn,
                "policies_to_train": [
                    "agent_policy{}".format(i) for i in range(n_agents)
                ],
            },
            "callbacks": {
                "on_episode_start": tune.function(on_episode_start),
                "on_episode_step": tune.function(on_episode_step),
                "on_episode_end": tune.function(on_episode_end),
            },
            "log_level": "ERROR",
        },
    )
elif train_algo == "A3C":
    ...  # A3C branch not shown in this snippet
```
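The config above assumes that `policy_graphs`, `policy_mapping_fn`, and the episode callbacks are defined elsewhere. A minimal sketch of the multi-agent pieces, using hypothetical observation/action spaces and agent-id naming (older `policy_graphs`-style multi-agent API):

```python
import numpy as np
from gym import spaces

# Hypothetical spaces and agent-id scheme; replace with the real environment's.
n_agents = 2
obs_space = spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32)
act_space = spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32)

# One DDPG policy per agent. A policy class of None tells RLlib to use the
# trainer's default policy (here: DDPG's).
policy_graphs = {
    "agent_policy{}".format(i): (None, obs_space, act_space, {})
    for i in range(n_agents)
}

def policy_mapping_fn(agent_id):
    # Assumes agent ids of the form "agent_0", "agent_1", ...
    return "agent_policy{}".format(str(agent_id).split("_")[-1])
```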
An Ape-X DDPG variant of the DDPG defaults, written against the older RLlib API (`merge_dicts`, `sample_batch_size`, `per_worker_exploration`):

```python
from ray.rllib.utils import merge_dicts
# Imports below are assumed from the same (older, agents-based) RLlib API:
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.agents.dqn.apex import APEX_TRAINER_PROPERTIES

APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
    DDPG_CONFIG,  # see also the options in ddpg.py, which are also supported
    {
        "optimizer": merge_dicts(
            DDPG_CONFIG["optimizer"],
            {
                "max_weight_sync_delay": 400,
                "num_replay_buffer_shards": 4,
                "debug": False,
            },
        ),
        "n_step": 3,
        "num_gpus": 0,
        "num_workers": 32,
        "buffer_size": 2000000,
        "learning_starts": 50000,
        "train_batch_size": 512,
        "sample_batch_size": 50,
        "target_network_update_freq": 500000,
        "timesteps_per_iteration": 25000,
        "per_worker_exploration": True,
        "worker_side_prioritization": True,
        "min_iter_time_s": 30,
    },
)

ApexDDPGTrainer = DDPGTrainer.with_updates(
    name="APEX_DDPG",
    default_config=APEX_DDPG_DEFAULT_CONFIG,
    **APEX_TRAINER_PROPERTIES,
)
```
A later revision of the same Ape-X DDPG defaults, using `merge_trainer_configs`, an `exploration_config`, and the Ape-X execution plan:

```python
from ray.rllib.agents.dqn.apex import apex_execution_plan
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
    DEFAULT_CONFIG as DDPG_CONFIG

APEX_DDPG_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
    DDPG_CONFIG,  # see also the options in ddpg.py, which are also supported
    {
        "optimizer": {
            "max_weight_sync_delay": 400,
            "num_replay_buffer_shards": 4,
            "debug": False,
        },
        "exploration_config": {"type": "PerWorkerOrnsteinUhlenbeckNoise"},
        "n_step": 3,
        "num_gpus": 0,
        "num_workers": 32,
        "buffer_size": 2000000,
        "learning_starts": 50000,
        "train_batch_size": 512,
        "rollout_fragment_length": 50,
        "target_network_update_freq": 500000,
        "timesteps_per_iteration": 25000,
        "worker_side_prioritization": True,
        "min_iter_time_s": 30,
    },
)

ApexDDPGTrainer = DDPGTrainer.with_updates(
    name="APEX_DDPG",
    default_config=APEX_DDPG_DEFAULT_CONFIG,
    execution_plan=apex_execution_plan,
)
```
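A rough usage sketch for the trainer class built above; the environment name, worker count, and stopping criterion below are illustrative, not part of the original defaults:

```python
import ray
from ray import tune

ray.init()
tune.run(
    ApexDDPGTrainer,
    config={
        "env": "Pendulum-v0",  # any continuous-action environment
        "num_workers": 4,      # scaled down from the 32-worker default
        "num_gpus": 0,
    },
    stop={"timesteps_total": 1000000},
)
```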
TD3 defaults from an older RLlib release, where exploration is configured through explicit `exploration_*` keys; the opening of the override dict is missing from the snippet and is reconstructed below:

```python
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
    DEFAULT_CONFIG as DDPG_CONFIG

TD3_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(  # opening reconstructed; the snippet starts mid-dict
    DDPG_CONFIG,
    {
        # ... (leading TD3-specific overrides such as twin Q and delayed
        # policy updates are not part of this snippet) ...
        # no Huber loss, etc.
        "exploration_should_anneal": False,
        "exploration_noise_type": "gaussian",
        "exploration_gaussian_sigma": 0.1,
        "learning_starts": 10000,
        "pure_exploration_steps": 10000,
        "actor_hiddens": [400, 300],
        "critic_hiddens": [400, 300],
        "n_step": 1,
        "gamma": 0.99,
        "actor_lr": 1e-3,
        "critic_lr": 1e-3,
        "l2_reg": 0.0,
        "tau": 5e-3,
        "train_batch_size": 100,
        "use_huber": False,
        "target_network_update_freq": 0,
        "num_workers": 0,
        "num_gpus_per_worker": 0,
        "per_worker_exploration": False,
        "worker_side_prioritization": False,
        "buffer_size": 1000000,
        "prioritized_replay": False,
        "clip_rewards": False,
        "use_state_preprocessor": False,
    },
)

TD3Trainer = DDPGTrainer.with_updates(
    name="TD3",
    default_config=TD3_DEFAULT_CONFIG,
)
```
The same TD3 defaults in a later RLlib revision, where exploration is expressed as a `GaussianNoise` `exploration_config`:

```python
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
    DEFAULT_CONFIG as DDPG_CONFIG

TD3_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
    DDPG_CONFIG,
    {
        # largest changes: twin Q functions, delayed policy updates, and target
        # smoothing
        "twin_q": True,
        "policy_delay": 2,
        "smooth_target_policy": True,
        "target_noise": 0.2,
        "target_noise_clip": 0.5,
        "exploration_config": {
            # TD3 uses simple Gaussian noise on top of deterministic NN-output
            # actions (after a possible pure random phase of n timesteps).
            "type": "GaussianNoise",
            # For how many timesteps should we return completely random
            # actions, before we start adding (scaled) noise?
            "random_timesteps": 10000,
            # Gaussian stddev of action noise for exploration.
            "stddev": 0.1,
            # Scaling settings by which the Gaussian noise is scaled before
            # being added to the actions. NOTE: The scale timesteps start only
            # after(!) any random steps have been finished.
            # By default, do not anneal over time (fixed 1.0).
            "initial_scale": 1.0,
            "final_scale": 1.0,
            "scale_timesteps": 1,
        },
        # other changes & things we want to keep fixed:
        # larger actor learning rate, no l2 regularisation, no Huber loss, etc.
        "learning_starts": 10000,
        "actor_hiddens": [400, 300],
        "critic_hiddens": [400, 300],
        "n_step": 1,
        "gamma": 0.99,
        "actor_lr": 1e-3,
        "critic_lr": 1e-3,
        "l2_reg": 0.0,
        "tau": 5e-3,
        "train_batch_size": 100,
        "use_huber": False,
        "target_network_update_freq": 0,
        "num_workers": 0,
        "num_gpus_per_worker": 0,
        "worker_side_prioritization": False,
        "buffer_size": 1000000,
        "prioritized_replay": False,
        "clip_rewards": False,
        "use_state_preprocessor": False,
    },
)
```
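With `initial_scale == final_scale == 1.0` and `scale_timesteps == 1`, the Gaussian exploration noise never decays. A small sketch of overriding those keys to anneal the noise instead (the numbers are illustrative):

```python
td3_config = dict(TD3_DEFAULT_CONFIG)
td3_config["exploration_config"] = dict(
    TD3_DEFAULT_CONFIG["exploration_config"],
    initial_scale=1.0,       # start at full-strength noise
    final_scale=0.1,         # decay to 10% of the configured stddev
    scale_timesteps=200000,  # linear decay over the first 200k non-random steps
)
```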
A multi-agent SUMO traffic-signal experiment (sumo-rl) that trains a single `offset_agent` DDPG policy:

```python
import numpy as np
from gym import spaces
from ray.tune.registry import register_env
# SumoEnvironment, DDPGTFPolicy, DDPGTrainer, and policy_mapping are assumed
# to be imported/defined elsewhere in the project.

# The start of the env registration is cut off in the original snippet; the
# register_env("2TLS", ...) opening below is reconstructed from the env name
# used by the trainer.
register_env("2TLS", lambda _: SumoEnvironment(
    net_file='/home/sonic/Desktop/sumo-rl-research-offset/sumo-rl-research/experiments/nets/Research/case04/intersection.net.xml',
    route_file='/home/sonic/Desktop/sumo-rl-research-offset/sumo-rl-research/experiments/nets/Research/case04/intersection.rou.xml',
    out_csv_path='outputs/case04/',
    out_csv_name='DDPG',
    use_gui=False,
    num_seconds=12240612,
    time_to_load_vehicles=612,
    max_depart_delay=0))

trainer = DDPGTrainer(
    env="2TLS",
    config={
        "multiagent": {
            "policy_graphs": {
                'offset_agent': (
                    DDPGTFPolicy,
                    spaces.Box(low=np.zeros(2), high=np.array(['inf'] * 2)),
                    spaces.Box(low=np.array([0, 0]), high=np.array([+1, +1])),
                    {},
                ),
            },
            "policy_mapping_fn": policy_mapping,  # Traffic lights are always controlled by this policy
        },
        "lr": 0.0001,
    })

while True:
    result = trainer.train()
# /home/sonic/Desktop/sumo-rl-research-offset/sumo-rl-research/experiments/
```
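The `while True` loop above trains indefinitely without saving anything. A sketch of a variant that periodically checkpoints and reports the mean episode reward (the checkpoint directory is illustrative):

```python
iteration = 0
while True:
    result = trainer.train()
    if iteration % 10 == 0:
        checkpoint_path = trainer.save("outputs/case04/checkpoints")
        print(iteration, result.get("episode_reward_mean"), checkpoint_path)
    iteration += 1
```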
A more recent Ape-X DDPG revision that starts from `DDPGConfig().to_dict()` and moves replay settings into `replay_buffer_config`:

```python
# Assumed imports for this API generation:
from ray.rllib.agents.ddpg.ddpg import DDPGConfig, DDPGTrainer
from ray.rllib.utils.deprecation import DEPRECATED_VALUE

APEX_DDPG_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
    DDPGConfig().to_dict(),  # see also the options in ddpg.py, which are also supported
    {
        "optimizer": {
            "max_weight_sync_delay": 400,
            "num_replay_buffer_shards": 4,
            "debug": False,
        },
        "exploration_config": {"type": "PerWorkerOrnsteinUhlenbeckNoise"},
        "n_step": 3,
        "num_gpus": 0,
        "num_workers": 32,
        "replay_buffer_config": {
            "capacity": 2000000,
            "no_local_replay_buffer": True,
            # Specify prioritized replay by supplying a buffer type that supports
            # prioritization, for example: MultiAgentPrioritizedReplayBuffer.
            "prioritized_replay": DEPRECATED_VALUE,
            "learning_starts": 50000,
            # Whether all shards of the replay buffer must be co-located
            # with the learner process (running the execution plan).
            # This is preferred b/c the learner process should have quick
            # access to the data from the buffer shards, avoiding network
            # traffic each time samples from the buffer(s) are drawn.
            # Set this to False for relaxing this constraint and allowing
            # replay shards to be created on node(s) other than the one
            # on which the learner is located.
            "replay_buffer_shards_colocated_with_driver": True,
            "worker_side_prioritization": True,
        },
        "train_batch_size": 512,
        "rollout_fragment_length": 50,
        # Update the target network every `target_network_update_freq` sample timesteps.
        "target_network_update_freq": 500000,
        "min_sample_timesteps_per_reporting": 25000,
        "min_time_s_per_reporting": 30,
    },
    _allow_unknown_configs=True,
)
```
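For comparison, a rough sketch of expressing a few of the same settings through the `DDPGConfig` builder methods rather than a raw dict override; method and argument names follow the config-object API and may need adjusting to the RLlib version in use:

```python
config = (
    DDPGConfig()
    .rollouts(num_rollout_workers=32, rollout_fragment_length=50)
    .training(train_batch_size=512)
    .resources(num_gpus=0)
)
print(config.to_dict()["train_batch_size"])  # 512
```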
A parametric-action DDPG (PADDPG) Torch policy and trainer, built by overriding the model and action-distribution function of the stock DDPG Torch policy; the truncated signature of the helper is reconstructed below:

```python
# Assumed imports (older, agents-based Torch API); build_paddpg_models is a
# project-specific model builder defined elsewhere.
from ray.rllib.models import ModelCatalog
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.agents.ddpg.ddpg_torch_policy import DDPGTorchPolicy


def get_distribution_inputs_and_class(policy,
                                      model,
                                      obs_batch,
                                      *,
                                      explore=True,
                                      is_training=False,
                                      **kwargs):
    # (positional parameters reconstructed from their use in the body)
    model_out, _ = model({
        "obs": obs_batch,
        "is_training": is_training,
    }, [], None)
    dist_inputs = model.get_policy_output(model_out)
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        model.action_space, policy.config["model"], framework="torch")
    return dist_inputs, dist_class, []  # []=state out


PADDPGTorchPolicy = DDPGTorchPolicy.with_updates(
    # loss_fn=paddpg_loss,
    # action_sampler_fn=action_sampler_fn,
    validate_spaces=None,
    action_distribution_fn=get_distribution_inputs_and_class,
    make_model_and_action_dist=None,
    make_model=build_paddpg_models,
)


def get_policy_class(config):
    return PADDPGTorchPolicy


PADDPGTrainer = DDPGTrainer.with_updates(
    name="PADDPG",
    default_config=DDPG_CONFIG,
    get_policy_class=get_policy_class,
    default_policy=PADDPGTorchPolicy,
)
```
Another intermediate revision of the Ape-X DDPG defaults, from the point where `replay_buffer_config` existed but was not yet consumed by Ape-X:

```python
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
    DEFAULT_CONFIG as DDPG_CONFIG

APEX_DDPG_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
    DDPG_CONFIG,  # see also the options in ddpg.py, which are also supported
    {
        "optimizer": {
            "max_weight_sync_delay": 400,
            "num_replay_buffer_shards": 4,
            "debug": False,
        },
        "exploration_config": {"type": "PerWorkerOrnsteinUhlenbeckNoise"},
        "n_step": 3,
        "num_gpus": 0,
        "num_workers": 32,
        "buffer_size": 2000000,
        # TODO(jungong) : update once Apex supports replay_buffer_config.
        "replay_buffer_config": None,
        # Whether all shards of the replay buffer must be co-located
        # with the learner process (running the execution plan).
        # This is preferred b/c the learner process should have quick
        # access to the data from the buffer shards, avoiding network
        # traffic each time samples from the buffer(s) are drawn.
        # Set this to False for relaxing this constraint and allowing
        # replay shards to be created on node(s) other than the one
        # on which the learner is located.
        "replay_buffer_shards_colocated_with_driver": True,
        "learning_starts": 50000,
        "train_batch_size": 512,
        "rollout_fragment_length": 50,
        "target_network_update_freq": 500000,
        "timesteps_per_iteration": 25000,
        "worker_side_prioritization": True,
        "min_iter_time_s": 30,
    },
    _allow_unknown_configs=True,
)
```