# Deep Deterministic Policy Gradient (DDPG)
  print("Training algorithm: Deep Deterministic Policy Gradient (DDPG)")

  trainer = DDPGTrainer(
              env=env_title,
              config={
                "num_workers": num_workers,
                "num_cpus_per_worker": num_cpus_per_worker,
                "num_gpus": num_gpus,
                "num_gpus_per_worker": num_gpus_per_worker,
                "model": nw_model,
                "lr": lr,
                "gamma": gamma,
                "actor_hiddens": [hidden_layer_size, hidden_layer_size],
                "critic_hiddens": [hidden_layer_size, hidden_layer_size],
                "multiagent": {
                  "policy_graphs": policy_graphs,
                  "policy_mapping_fn": policy_mapping_fn,
                  "policies_to_train": ["agent_policy{}".format(i) for i in range(n_agents)],
                },
                "callbacks": {
                  "on_episode_start": tune.function(on_episode_start),
                  "on_episode_step": tune.function(on_episode_step),
                  "on_episode_end": tune.function(on_episode_end),
                },
                "log_level": "ERROR",
              })
             

# The original script continues with further branches, e.g.
# elif train_algo == "A3C": ...  (body not shown here).
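Once the trainer above is constructed, training is driven by repeated trainer.train() calls. A minimal sketch of such a loop; num_iterations and the checkpoint directory are illustrative assumptions and do not appear in the original script:

num_iterations = 100  # illustrative value, not from the original script
for i in range(num_iterations):
    result = trainer.train()
    print("iter {}: episode_reward_mean = {}".format(
        i, result["episode_reward_mean"]))
    if (i + 1) % 10 == 0:
        # Trainer.save() returns the path of the written checkpoint.
        checkpoint_path = trainer.save("checkpoints/ddpg")
        print("saved checkpoint to", checkpoint_path)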
## Example 2
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
    DEFAULT_CONFIG as DDPG_CONFIG
# APEX_TRAINER_PROPERTIES comes from the DQN Ape-X trainer in this older
# RLlib API (assumed import, not shown in the original snippet).
from ray.rllib.agents.dqn.apex import APEX_TRAINER_PROPERTIES
from ray.rllib.utils import merge_dicts

APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
    DDPG_CONFIG,  # see also the options in ddpg.py, which are also supported
    {
        "optimizer": merge_dicts(
            DDPG_CONFIG["optimizer"], {
                "max_weight_sync_delay": 400,
                "num_replay_buffer_shards": 4,
                "debug": False
            }),
        "n_step": 3,
        "num_gpus": 0,
        "num_workers": 32,
        "buffer_size": 2000000,
        "learning_starts": 50000,
        "train_batch_size": 512,
        "sample_batch_size": 50,
        "target_network_update_freq": 500000,
        "timesteps_per_iteration": 25000,
        "per_worker_exploration": True,
        "worker_side_prioritization": True,
        "min_iter_time_s": 30,
    },
)

ApexDDPGTrainer = DDPGTrainer.with_updates(
    name="APEX_DDPG",
    default_config=APEX_DDPG_DEFAULT_CONFIG,
    **APEX_TRAINER_PROPERTIES)
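The resulting ApexDDPGTrainer can be launched like any other RLlib trainer, for example through Tune. A minimal sketch, assuming the Pendulum-v0 environment, a small worker count, and a short stop criterion (all illustrative choices, not part of the snippet):

import ray
from ray import tune

ray.init()
# Environment, num_workers, and stop criterion are assumptions for illustration.
tune.run(
    ApexDDPGTrainer,
    config={"env": "Pendulum-v0", "num_workers": 4},
    stop={"training_iteration": 20},
)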
## Example 3
from ray.rllib.agents.dqn.apex import apex_execution_plan
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
    DEFAULT_CONFIG as DDPG_CONFIG

APEX_DDPG_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
    DDPG_CONFIG,  # see also the options in ddpg.py, which are also supported
    {
        "optimizer": {
            "max_weight_sync_delay": 400,
            "num_replay_buffer_shards": 4,
            "debug": False
        },
        "exploration_config": {
            "type": "PerWorkerOrnsteinUhlenbeckNoise"
        },
        "n_step": 3,
        "num_gpus": 0,
        "num_workers": 32,
        "buffer_size": 2000000,
        "learning_starts": 50000,
        "train_batch_size": 512,
        "rollout_fragment_length": 50,
        "target_network_update_freq": 500000,
        "timesteps_per_iteration": 25000,
        "worker_side_prioritization": True,
        "min_iter_time_s": 30,
    },
)

ApexDDPGTrainer = DDPGTrainer.with_updates(
    name="APEX_DDPG",
    default_config=APEX_DDPG_DEFAULT_CONFIG,
    # The original snippet is truncated here; the execution plan imported
    # above is the natural final argument.
    execution_plan=apex_execution_plan)
## Example 4

File: td3.py, project: ray1201/ray-1
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
    DEFAULT_CONFIG as DDPG_CONFIG

# Assumed opening: the original snippet starts mid-dict. The TD3-specific
# keys (twin_q, policy_delay, smooth_target_policy, ...) were cut off; see
# Example 5 for the full set.
TD3_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
    DDPG_CONFIG,
    {
        # other settings kept fixed: larger actor learning rate,
        # no l2 regularisation, no Huber loss, etc.
        "exploration_should_anneal": False,
        "exploration_noise_type": "gaussian",
        "exploration_gaussian_sigma": 0.1,
        "learning_starts": 10000,
        "pure_exploration_steps": 10000,
        "actor_hiddens": [400, 300],
        "critic_hiddens": [400, 300],
        "n_step": 1,
        "gamma": 0.99,
        "actor_lr": 1e-3,
        "critic_lr": 1e-3,
        "l2_reg": 0.0,
        "tau": 5e-3,
        "train_batch_size": 100,
        "use_huber": False,
        "target_network_update_freq": 0,
        "num_workers": 0,
        "num_gpus_per_worker": 0,
        "per_worker_exploration": False,
        "worker_side_prioritization": False,
        "buffer_size": 1000000,
        "prioritized_replay": False,
        "clip_rewards": False,
        "use_state_preprocessor": False,
    },
)

TD3Trainer = DDPGTrainer.with_updates(name="TD3",
                                      default_config=TD3_DEFAULT_CONFIG)
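With the TD3Trainer class defined above, TD3 can also be run directly without Tune. A minimal sketch, assuming Pendulum-v0 as the environment and a lowered learning_starts override (both illustrative choices):

import ray

ray.init()
# Environment name and config override are assumptions for illustration.
trainer = TD3Trainer(env="Pendulum-v0", config={"learning_starts": 1000})
for _ in range(5):
    print(trainer.train()["episode_reward_mean"])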
## Example 5
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
    DEFAULT_CONFIG as DDPG_CONFIG

TD3_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
    DDPG_CONFIG,
    {
        # largest changes: twin Q functions, delayed policy updates, and target
        # smoothing
        "twin_q": True,
        "policy_delay": 2,
        "smooth_target_policy": True,
        "target_noise": 0.2,
        "target_noise_clip": 0.5,
        "exploration_config": {
            # TD3 uses simple Gaussian noise on top of deterministic NN-output
            # actions (after a possible pure random phase of n timesteps).
            "type": "GaussianNoise",
            # For how many timesteps should we return completely random
            # actions, before we start adding (scaled) noise?
            "random_timesteps": 10000,
            # Gaussian stddev of action noise for exploration.
            "stddev": 0.1,
            # Scaling settings by which the Gaussian noise is scaled before
            # being added to the actions. NOTE: The scale timesteps start only
            # after(!) any random steps have been finished.
            # By default, do not anneal over time (fixed 1.0).
            "initial_scale": 1.0,
            "final_scale": 1.0,
            "scale_timesteps": 1
        },

        # other changes & things we want to keep fixed:
        # larger actor learning rate, no l2 regularisation, no Huber loss, etc.
        "learning_starts": 10000,
        "actor_hiddens": [400, 300],
        "critic_hiddens": [400, 300],
        "n_step": 1,
        "gamma": 0.99,
        "actor_lr": 1e-3,
        "critic_lr": 1e-3,
        "l2_reg": 0.0,
        "tau": 5e-3,
        "train_batch_size": 100,
        "use_huber": False,
        "target_network_update_freq": 0,
        "num_workers": 0,
        "num_gpus_per_worker": 0,
        "worker_side_prioritization": False,
        "buffer_size": 1000000,
        "prioritized_replay": False,
        "clip_rewards": False,
        "use_state_preprocessor": False,
    })
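The original snippet ends with the merged config. As in Example 4, it would typically be attached to a TD3 trainer class via with_updates; a sketch mirroring that pattern:

# Mirrors the TD3Trainer definition shown in Example 4.
TD3Trainer = DDPGTrainer.with_updates(
    name="TD3",
    default_config=TD3_DEFAULT_CONFIG)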
## Example 6
            net_file=
            '/home/sonic/Desktop/sumo-rl-research-offset/sumo-rl-research/experiments/nets/Research/case04/intersection.net.xml',
            route_file=
            '/home/sonic/Desktop/sumo-rl-research-offset/sumo-rl-research/experiments/nets/Research/case04/intersection.rou.xml',
            out_csv_path='outputs/case04/',
            out_csv_name='DDPG',
            use_gui=False,
            num_seconds=12240612,
            time_to_load_vehicles=612,
            max_depart_delay=0))

    trainer = DDPGTrainer(
        env="2TLS",
        config={
            "multiagent": {
                "policy_graphs": {
                    'offset_agent': (DDPGTFPolicy,
                                     spaces.Box(low=np.zeros(2),
                                                high=np.array([np.inf] * 2)),
                                     spaces.Box(low=np.array([0, 0]),
                                                high=np.array([+1, +1])), {})
                },
                "policy_mapping_fn":
                policy_mapping  # Traffic lights are always controlled by this policy
            },
            "lr": 0.0001,
        })

    while True:
        result = trainer.train()
# /home/sonic/Desktop/sumo-rl-research-offset/sumo-rl-research/experiments/
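The config above references a policy_mapping callable that is not shown in the snippet. A minimal sketch consistent with the inline comment; this is a hypothetical reconstruction in which every agent id maps to the single 'offset_agent' policy:

def policy_mapping(agent_id):
    # All traffic-light agents are controlled by the same DDPG policy.
    return 'offset_agent'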
## Example 7
# Assumed imports for the newer RLlib API used here (DDPGConfig objects,
# replay_buffer_config); they are not shown in the original snippet.
from ray.rllib.agents.ddpg.ddpg import DDPGConfig, DDPGTrainer
from ray.rllib.utils.deprecation import DEPRECATED_VALUE

APEX_DDPG_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
    DDPGConfig().to_dict(
    ),  # see also the options in ddpg.py, which are also supported
    {
        "optimizer": {
            "max_weight_sync_delay": 400,
            "num_replay_buffer_shards": 4,
            "debug": False,
        },
        "exploration_config": {
            "type": "PerWorkerOrnsteinUhlenbeckNoise"
        },
        "n_step": 3,
        "num_gpus": 0,
        "num_workers": 32,
        "replay_buffer_config": {
            "capacity": 2000000,
            "no_local_replay_buffer": True,
            # Specify prioritized replay by supplying a buffer type that supports
            # prioritization, for example: MultiAgentPrioritizedReplayBuffer.
            "prioritized_replay": DEPRECATED_VALUE,
            "learning_starts": 50000,
            # Whether all shards of the replay buffer must be co-located
            # with the learner process (running the execution plan).
            # This is preferred b/c the learner process should have quick
            # access to the data from the buffer shards, avoiding network
            # traffic each time samples from the buffer(s) are drawn.
            # Set this to False for relaxing this constraint and allowing
            # replay shards to be created on node(s) other than the one
            # on which the learner is located.
            "replay_buffer_shards_colocated_with_driver": True,
            "worker_side_prioritization": True,
        },
        "train_batch_size": 512,
        "rollout_fragment_length": 50,
        # Update the target network every `target_network_update_freq` sample timesteps.
        "target_network_update_freq": 500000,
        "min_sample_timesteps_per_reporting": 25000,
        "min_time_s_per_reporting": 30,
    },
    _allow_unknown_configs=True,
)
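The snippet stops after the merged config. A sketch following the same with_updates pattern used in Examples 2 and 3; the execution-plan argument is omitted because this snippet does not show it:

# Same pattern as Examples 2 and 3.
ApexDDPGTrainer = DDPGTrainer.with_updates(
    name="APEX_DDPG",
    default_config=APEX_DDPG_DEFAULT_CONFIG)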
## Example 8
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
    DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.agents.ddpg.ddpg_torch_policy import DDPGTorchPolicy
from ray.rllib.models.catalog import ModelCatalog

# build_paddpg_models and paddpg_loss are project-specific helpers defined
# elsewhere in this snippet's source file (not shown).


def get_distribution_inputs_and_class(policy,
                                      model,
                                      obs_batch,
                                      *,
                                      explore=True,
                                      is_training=False,
                                      **kwargs):
    model_out, _ = model({
        "obs": obs_batch,
        "is_training": is_training,
    }, [], None)
    dist_inputs = model.get_policy_output(model_out)
    dist_class, logit_dim = ModelCatalog.get_action_dist(
                    model.action_space, policy.config["model"], framework="torch")
    return dist_inputs, dist_class, []  # []=state out

PADDPGTorchPolicy = DDPGTorchPolicy.with_updates(
    # loss_fn=paddpg_loss,
    # action_sampler_fn=action_sampler_fn,
    validate_spaces=None,
    action_distribution_fn=get_distribution_inputs_and_class,
    make_model_and_action_dist=None,
    make_model=build_paddpg_models,
)

def get_policy_class(config):
    return PADDPGTorchPolicy

PADDPGTrainer = DDPGTrainer.with_updates(
    name="PADDPG",
    default_config=DDPG_CONFIG,
    get_policy_class=get_policy_class,
    default_policy=PADDPGTorchPolicy,
)
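PADDPGTrainer is built on the Torch policy, so it should be run with the torch framework. A minimal sketch, assuming Pendulum-v0 and that the project-specific helpers referenced above are importable (both assumptions for illustration):

import ray

ray.init()
# Environment and framework setting are illustrative assumptions.
trainer = PADDPGTrainer(env="Pendulum-v0", config={"framework": "torch"})
result = trainer.train()
print(result["episode_reward_mean"])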
## Example 9
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
    DEFAULT_CONFIG as DDPG_CONFIG

APEX_DDPG_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs(
    DDPG_CONFIG,  # see also the options in ddpg.py, which are also supported
    {
        "optimizer": {
            "max_weight_sync_delay": 400,
            "num_replay_buffer_shards": 4,
            "debug": False
        },
        "exploration_config": {
            "type": "PerWorkerOrnsteinUhlenbeckNoise"
        },
        "n_step": 3,
        "num_gpus": 0,
        "num_workers": 32,
        "buffer_size": 2000000,
        # TODO(jungong) : update once Apex supports replay_buffer_config.
        "replay_buffer_config": None,
        # Whether all shards of the replay buffer must be co-located
        # with the learner process (running the execution plan).
        # This is preferred b/c the learner process should have quick
        # access to the data from the buffer shards, avoiding network
        # traffic each time samples from the buffer(s) are drawn.
        # Set this to False for relaxing this constraint and allowing
        # replay shards to be created on node(s) other than the one
        # on which the learner is located.
        "replay_buffer_shards_colocated_with_driver": True,
        "learning_starts": 50000,
        "train_batch_size": 512,
        "rollout_fragment_length": 50,
        "target_network_update_freq": 500000,
        "timesteps_per_iteration": 25000,
        "worker_side_prioritization": True,
        "min_iter_time_s": 30,
    },
    _allow_unknown_configs=True,
)