Example #1
File: ddpg.py Project: zqxyz73/ray
    if config.get("exploration_final_scale", DEPRECATED_VALUE) != \
            DEPRECATED_VALUE:
        deprecation_warning("exploration_final_scale",
                            "exploration_config.final_scale")
        if isinstance(config["exploration_config"], dict):
            config["exploration_config"]["final_scale"] = \
                config.pop("exploration_final_scale")
    if config.get("exploration_fraction", DEPRECATED_VALUE) != \
            DEPRECATED_VALUE:
        assert schedule_max_timesteps is not None
        deprecation_warning("exploration_fraction",
                            "exploration_config.scale_timesteps")
        if isinstance(config["exploration_config"], dict):
            config["exploration_config"]["scale_timesteps"] = config.pop(
                "exploration_fraction") * schedule_max_timesteps
    if config.get("per_worker_exploration", DEPRECATED_VALUE) != \
            DEPRECATED_VALUE:
        deprecation_warning(
            "per_worker_exploration",
            "exploration_config.type=PerWorkerOrnsteinUhlenbeckNoise")
        if isinstance(config["exploration_config"], dict):
            config["exploration_config"]["type"] = \
                PerWorkerOrnsteinUhlenbeckNoise


DDPGTrainer = GenericOffPolicyTrainer.with_updates(
    name="DDPG",
    default_config=DEFAULT_CONFIG,
    default_policy=DDPGTFPolicy,
    validate_config=validate_config,
)
Example #2

def execution_plan(workers, config):
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    replay_buffer = SimpleReplayBuffer(config["buffer_size"])

    store_op = rollouts \
        .for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

    train_op = Replay(local_buffer=replay_buffer) \
        .combine(
            ConcatBatches(min_batch_size=config["train_batch_size"])) \
        .for_each(TrainOneStep(workers)) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    merged_op = Concurrently([store_op, train_op],
                             mode="round_robin",
                             output_indexes=[1])

    return StandardMetricsReporting(merged_op, workers, config)


QMixTrainer = GenericOffPolicyTrainer.with_updates(
    name="QMIX",
    default_config=DEFAULT_CONFIG,
    default_policy=QMixTorchPolicy,
    get_policy_class=None,
    validate_config=validate_config,
    execution_plan=execution_plan)
Example #3
    "critic_tau": 0.01,
    "encoder_tau": 0.05,

    "learning_starts": 1000,
    "train_batch_size": 32,
    "gamma": 0.99,

    "initial_alpha": 0.1,

    # customs 
    "embed_dim": 128,
    "encoder_type": "pixel",
    "num_layers": 4,
    "num_filters": 32,
    "cropped_image_size": 54,
}
SAC_CONFIG = DEFAULT_CONFIG.copy()
SAC_CONFIG.update(new_config)



CurlSACTrainer = GenericOffPolicyTrainer.with_updates(
    name="CurlSAC",
    default_config=SAC_CONFIG,
    validate_config=validate_config,
    default_policy=CurlSACTorchPolicy,
    get_policy_class=get_sac_policy_class
)


Example #4
    "redo_invalid_games": False,
    "wandb": {},
    "ed": None,
    "policy_catalog": None,
    "eq_iters": None,
    "adaptive_pval_test": False,
    "br_thres": None,
    "eq_thres": None,
    "br_eval_against_policy": None,
    "thres_is_pval": None,
    "adaptive_pval": None
}

PG_CUSTOM_EVAL_TRAINER_DEFAULT_CONFIG = with_base_config(
    base_config=DEFAULT_CONFIG,
    extra_config=ppo_custom_eval_trainer_added_config_items)

mixins = [CustomEvaluationsTrainerMixin, WeightsUtilsTrainerMixin]

SACTrainer = GenericOffPolicyTrainer.with_updates(
    name="SACDiscrete",
    default_config=PG_CUSTOM_EVAL_TRAINER_DEFAULT_CONFIG,
    default_policy=SACTFPolicy,
    validate_config=validate_config,
    before_init=pg_custom_eval_trainer_before_init,
    after_init=pg_custom_eval_trainer_after_init,
    after_optimizer_step=after_optimizer_step,
    collect_metrics_fn=collect_metrics,
    make_policy_optimizer=make_optimizer,
    mixins=mixins)
Example #5
        "num_workers": 32,
        "buffer_size": 2000000,
        "learning_starts": 50000,
        "train_batch_size": 512,
        "rollout_fragment_length": 50,
        "target_network_update_freq": 500000,
        "timesteps_per_iteration": 1000,
        "exploration_config": {
            "type": "PerWorkerEpsilonGreedy"
        },
        "worker_side_prioritization": True,
        "min_iter_time_s": 30,
        "training_intensity": None,
        "prioritized_replay": True,
        "prioritized_replay_alpha": 0.6,
        "prioritized_replay_beta": 0.4,
        "final_prioritized_replay_beta": 0.4,
        "prioritized_replay_beta_annealing_timesteps": 20000,
        "prioritized_replay_eps": 1e-6,
    },
)

QMixTrainer = GenericOffPolicyTrainer.with_updates(
    name="QMIXApex",
    default_config=QMIX_APEX_DEFAULT_CONFIG,
    default_policy=QMixTorchPolicy,
    get_policy_class=None,
    execution_plan=apex_execution_plan)

register_trainable("QMIXApex", QMixTrainer)
Example #6

def setup_ddpg_exploration(trainer):
    trainer.exploration0 = make_exploration_schedule(trainer.config, -1)
    trainer.explorations = [
        make_exploration_schedule(trainer.config, i)
        for i in range(trainer.config["num_workers"])
    ]


def add_pure_exploration_phase(trainer):
    global_timestep = trainer.optimizer.num_steps_sampled
    pure_expl_steps = trainer.config["pure_exploration_steps"]
    if pure_expl_steps:
        # tell workers whether they should do pure exploration
        only_explore = global_timestep < pure_expl_steps
        trainer.workers.local_worker().foreach_trainable_policy(
            lambda p, _: p.set_pure_exploration_phase(only_explore))
        for e in trainer.workers.remote_workers():
            e.foreach_trainable_policy.remote(
                lambda p, _: p.set_pure_exploration_phase(only_explore))
    update_worker_explorations(trainer)


DDPGTrainer = GenericOffPolicyTrainer.with_updates(
    name="DDPG",
    default_config=DEFAULT_CONFIG,
    default_policy=DDPGTFPolicy,
    before_init=setup_ddpg_exploration,
    before_train_step=add_pure_exploration_phase)
Example #7
    else:
        return SACTFPolicy


def validate_config(config):
    if config.get("grad_norm_clipping", DEPRECATED_VALUE) != DEPRECATED_VALUE:
        deprecation_warning("grad_norm_clipping", "grad_clip")
        config["grad_clip"] = config.pop("grad_norm_clipping")

    # Use same keys as for standard Trainer "model" config.
    for model in ["Q_model", "policy_model"]:
        if config[model].get("hidden_activation", DEPRECATED_VALUE) != \
                DEPRECATED_VALUE:
            deprecation_warning("{}.hidden_activation".format(model),
                                "{}.fcnet_activation".format(model),
                                error=True)
        if config[model].get("hidden_layer_sizes", DEPRECATED_VALUE) != \
                DEPRECATED_VALUE:
            deprecation_warning("{}.hidden_layer_sizes".format(model),
                                "{}.fcnet_hiddens".format(model),
                                error=True)


DreamerTrainer = GenericOffPolicyTrainer.with_updates(
    name="Dreamer",
    default_config=DEFAULT_CONFIG,
    validate_config=validate_config,
    default_policy=DreamerTorchPolicy,
    get_policy_class=get_policy_class,
)
Example #8
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 0,

    # === Parallelism ===
    # Whether to use a GPU for local optimization.
    "num_gpus": 0,
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you"re using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Whether to allocate CPUs for workers (if > 0).
    "num_cpus_per_worker": 1,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span
    "min_iter_time_s": 1,

    # TODO(ekl) these are unused; remove them from sac config
    "per_worker_exploration": False,
    "exploration_fraction": 0.1,
    "schedule_max_timesteps": 100000,
    "exploration_final_eps": 0.02,
})
# __sphinx_doc_end__
# yapf: enable

SACTrainer = GenericOffPolicyTrainer.with_updates(
    name="SAC", default_config=DEFAULT_CONFIG, default_policy=SACTFPolicy)
Example #9
File: maddpg.py Project: wsjeon/ray

class CustomStdOut(object):
    def _log_result(self, result):
        if result["training_iteration"] % 50 == 0:
            try:
                print(
                    "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                    .format(result["timesteps_total"],
                            result["episodes_total"],
                            result["episode_reward_mean"],
                            result["policy_reward_mean"],
                            round(result["time_total_s"] - self.cur_time, 3)))
            except Exception:
                # `self.cur_time` is not set yet on the first logged result.
                pass

            self.cur_time = result["time_total_s"]


MADDPGTrainer = GenericOffPolicyTrainer.with_updates(
    name="MADDPG",
    default_config=DEFAULT_CONFIG,
    default_policy=MADDPGTFPolicy,
    before_init=None,
    before_train_step=set_global_timestep,
    make_policy_optimizer=make_optimizer,
    after_train_result=add_trainer_metrics,
    collect_metrics_fn=collect_metrics,
    before_evaluate_fn=None,
    mixins=[CustomStdOut])
Example #10
    "explore": True,
    "exploration_config": {
        # Exploration sub-class by name or full path to module+class
        # (e.g. "ray.rllib.utils.exploration.epsilon_greedy.EpsilonGreedy")
        "type": "EpsilonGreedy",
        # Parameters for the Exploration class' constructor:
        "initial_epsilon": 1.0,
        "final_epsilon": 0.1,
        "epsilon_timesteps": 20000,  # Timesteps over which to anneal epsilon.
    },
})

HERRainbowTrainer = GenericOffPolicyTrainer.with_updates(
    name="HER_RainbowDQN",
    default_policy=DQNTorchPolicy,
    #     .with_updates(
    #     postprocess_fn=postprocess_with_HER
    # ),
    default_config=HER_RAINBOW_DQN_CONFIG)

if __name__ == "__main__":
    ray.init()
    parser = argparse.ArgumentParser()
    parser.add_argument("--steps", type=int, default=1_000_000)
    args = parser.parse_args()
    tune.run(
        HERRainbowTrainer,
        config={
            "env": "CartPole-v1",
            "num_workers": 1,
            #  "num_gpus": 1,
Example #11
    "dueling": True,
    "n_step": 20,
    "learning_starts": 1600,

    # "critic_learning_rate": 1e-3,
    "lr": 1e-3,
    "critic_beta": 0.9,
    "encoder_learning_rate": 1e-3,
    # "adam_epsilon": 1e-7,
    "cpc_update_freq": 1,
    "target_network_update_freq": 2,
    "critic_tau": 0.01,
    "encoder_tau": 0.05,
    "train_batch_size": 32,
    "gamma": 0.99,

    # customs
    "embed_dim": 128,
    "encoder_type": "impala",
}
RAINBOW_CONFIG = DEFAULT_CONFIG.copy()
RAINBOW_CONFIG.update(new_config)

BaselineRainbowTrainer = GenericOffPolicyTrainer.with_updates(
    name="BaselineRainbow",
    default_config=RAINBOW_CONFIG,
    validate_config=validate_config,
    default_policy=BaselineRainbowTorchPolicy,
    get_policy_class=get_rainbow_policy_class,
)
Example #12
                        type=str,
                        choices=['simple_adversary', 'simple_crypto', 'simple_push', 'simple_tag'],
                        default='simple_push',
                        help="Scenario name of the multiagent-particle-envs")

    args = parser.parse_args()

    # Define the trainer
    def make_sync_batch_optimizer(workers, config):
        return SyncBatchReplayOptimizer(workers,
                                        learning_starts=config["learning_starts"],
                                        buffer_size=config["buffer_size"],
                                        train_batch_size=config["train_batch_size"])

    MADDPGTrainer = GenericOffPolicyTrainer.with_updates(name="MADDPG",
                                                         default_config=MADDPG_CONFIG,
                                                         default_policy=MADDPGTorchPolicy,
                                                         make_policy_optimizer=make_sync_batch_optimizer)

    # Registry Environment
    register_env("simple_multiagent", lambda config: SimpleMultiAgentEnv(config))
    single_env = SimpleMultiAgentEnv(env_config={"scenario_name": args.scenario_name})

    # Policy Mapping
    policies = {
        agent: (None, single_env.observation_space[agent], single_env.action_space[agent], {
            "observation_spaces": single_env.observation_space,
            "action_spaces": single_env.action_space,
            "agent_id": agent
        }) for agent in single_env.agent_ids
    }
Example #13

# Experimental distributed execution impl; enable with "use_exec_api": True.
def execution_plan(workers, config):
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    replay_buffer = SimpleReplayBuffer(config["buffer_size"])

    store_op = rollouts \
        .for_each(StoreToReplayBuffer(local_buffer=replay_buffer))

    train_op = Replay(local_buffer=replay_buffer) \
        .combine(
            ConcatBatches(min_batch_size=config["train_batch_size"])) \
        .for_each(TrainOneStep(workers)) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    merged_op = Concurrently(
        [store_op, train_op], mode="round_robin", output_indexes=[1])

    return StandardMetricsReporting(merged_op, workers, config)


QMixTrainer = GenericOffPolicyTrainer.with_updates(
    name="QMIX",
    default_config=DEFAULT_CONFIG,
    default_policy=QMixTorchPolicy,
    get_policy_class=None,
    make_policy_optimizer=make_sync_batch_optimizer,
    execution_plan=execution_plan)
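
The comment at the top of this example notes that the execution-plan path is experimental and is enabled with "use_exec_api": True. A minimal launch sketch under that assumption (the env name and worker count are illustrative placeholders, not part of the original file):

from ray import tune

tune.run(
    QMixTrainer,
    config={
        "use_exec_api": True,       # route training through execution_plan() above
        "env": "grouped_twostep",   # hypothetical registered env name
        "num_workers": 2,
    },
)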
Example #14
    if config["grad_clip"] is not None and config["grad_clip"] <= 0.0:
        raise ValueError("`grad_clip` value must be > 0.0!")


def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]:
    """Policy class picker function. Class is chosen based on DL-framework.

    Args:
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        Optional[Type[Policy]]: The Policy class to use with SACTrainer.
            If None, use `default_policy` provided in build_trainer().
    """
    if config["framework"] == "torch":
        from ray.rllib.agents.sac.sac_torch_policy import SACTorchPolicy
        return SACTorchPolicy


# Build a child class of `Trainer` (based on the kwargs used to create the
# GenericOffPolicyTrainer class and the kwargs used in the call below), which
# uses the framework specific Policy determined in `get_policy_class()` above.
SACTrainer = GenericOffPolicyTrainer.with_updates(
    name="SAC",
    default_config=DEFAULT_CONFIG,
    validate_config=validate_config,
    default_policy=SACTFPolicy,
    get_policy_class=get_policy_class,
)
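
As the comment above explains, with_updates() builds a child Trainer class whose Policy is chosen per framework by get_policy_class(). A minimal usage sketch follows; the environment id and stopping criterion are assumptions for illustration, mirroring the tune.run call in Example #10:

import ray
from ray import tune

if __name__ == "__main__":
    ray.init()
    tune.run(
        SACTrainer,
        config={
            "env": "Pendulum-v0",   # assumed continuous-control environment
            "framework": "torch",   # get_policy_class() then picks SACTorchPolicy
            "num_workers": 0,
        },
        stop={"timesteps_total": 100000},
    )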
Example #15
def add_maddpg_postprocessing(config):
    """Add the before learn on batch hook.

    This hook is called explicitly prior to TrainOneStep() in the execution
    setups for DQN and APEX.
    """

    def f(batch, workers, config):
        policies = dict(workers.local_worker()
                        .foreach_trainable_policy(lambda p, i: (i, p)))
        return before_learn_on_batch(batch, policies,
                                     config["train_batch_size"])

    config["before_learn_on_batch"] = f
    return config


MADDPGTrainer = GenericOffPolicyTrainer.with_updates(
    name="MADDPG",
    default_config=DEFAULT_CONFIG,
    default_policy=MADDPGTFPolicy,
    validate_config=add_maddpg_postprocessing,
    get_policy_class=None,
    before_init=None,
    before_train_step=set_global_timestep,
    make_policy_optimizer=make_optimizer,
    after_train_result=add_trainer_metrics,
    collect_metrics_fn=collect_metrics,
    before_evaluate_fn=None)
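
The docstring above states that the before_learn_on_batch hook is invoked right before TrainOneStep() in the DQN/Ape-X execution setups. The fragment below is only an illustrative sketch of that ordering, not the actual RLlib internals; the import paths assume the same RLlib version used by these examples:

from ray.rllib.execution.replay_ops import Replay
from ray.rllib.execution.train_ops import TrainOneStep


def train_op_with_hook(replay_buffer, workers, config):
    # Rewrite each replayed batch through the configured hook, then train on
    # the (possibly modified) batch, matching the order the docstring describes.
    return Replay(local_buffer=replay_buffer) \
        .for_each(lambda batch: config["before_learn_on_batch"](
            batch, workers, config)) \
        .for_each(TrainOneStep(workers))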
Example #16
File: sac.py Project: ddworak94/ray
        logger.warning("`simple_optimizer` must be True (or unset) for SAC!")
        config["simple_optimizer"] = True


def get_policy_class(config: TrainerConfigDict) -> Optional[Type[Policy]]:
    """Policy class picker function. Class is chosen based on DL-framework.

    Args:
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        Optional[Type[Policy]]: The Policy class to use with SACTrainer.
            If None, use `default_policy` provided in build_trainer().
    """
    if config["framework"] == "torch":
        from ray.rllib.agents.sac.sac_torch_policy import SACTorchPolicy
        return SACTorchPolicy


# Build a child class of `Trainer` (based on the kwargs used to create the
# GenericOffPolicyTrainer class and the kwargs used in the call below), which
# uses the framework specific Policy determined in `get_policy_class()` above.
SACTrainer = GenericOffPolicyTrainer.with_updates(
    name="SAC",
    default_config=DEFAULT_CONFIG,
    validate_config=validate_config,
    default_policy=SACTFPolicy,
    get_policy_class=get_policy_class,
    allow_unknown_subkeys=["Q_model", "policy_model"],
)
Example #17
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span
    "min_iter_time_s": 1,

    # === Model ===
    "model": {
        "lstm_cell_size": 64,
        "max_seq_len": 999999,
    },
})
# __sphinx_doc_end__
# yapf: enable


def make_sync_batch_optimizer(workers, config):
    return SyncBatchReplayOptimizer(
        workers,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        train_batch_size=config["train_batch_size"])


QMixTrainer = GenericOffPolicyTrainer.with_updates(
    name="QMIX",
    default_config=DEFAULT_CONFIG,
    default_policy=QMixTorchPolicy,
    make_policy_optimizer=make_sync_batch_optimizer)
Example #18
         for i, new_act in enumerate(new_act_n)})

    # Share samples among agents.
    policy_batches = {pid: SampleBatch(samples) for pid in policies.keys()}
    return MultiAgentBatch(policy_batches, train_batch_size)


def add_maddpg_postprocessing(config):
    """Add the before learn on batch hook.

    This hook is called explicitly prior to TrainOneStep() in the execution
    setups for DQN and APEX.
    """
    def f(batch, workers, config):
        policies = dict(workers.local_worker().foreach_trainable_policy(
            lambda p, i: (i, p)))
        return before_learn_on_batch(batch, policies,
                                     config["train_batch_size"])

    config["before_learn_on_batch"] = f
    return config


MADDPGTrainer = GenericOffPolicyTrainer.with_updates(
    name="MADDPG2",
    default_config=DEFAULT_CONFIG,
    default_policy=MADDPG2TFPolicy,
    get_policy_class=None,
    validate_config=add_maddpg_postprocessing,
)
Example #19
            raise ValueError("Prioritized replay is not supported when "
                             "replay_mode=lockstep.")
        elif config["replay_sequence_length"] > 1:
            raise ValueError("Prioritized replay is not supported when "
                             "replay_sequence_length > 1.")


def get_dqn_policy_class(config):
    return BaselineDQNTorchPolicy


################################################################################
#################################   Trainers   #################################
################################################################################

new_config = {
    # customs
    "embed_dim": 256,
    "encoder_type": "impala",
}
DQN_CONFIG = DEFAULT_CONFIG.copy()
DQN_CONFIG.update(new_config)

BaselineDQNTrainer = GenericOffPolicyTrainer.with_updates(
    name="BaselineDQN",
    default_config=DQN_CONFIG,
    validate_config=validate_config,
    default_policy=BaselineDQNTorchPolicy,
    get_policy_class=get_dqn_policy_class,
)