Example #1
    def test_custom_multi_action_distribution(self):
        class Model:
            pass

        ray.init(
            object_store_memory=1000 * 1024 * 1024, ignore_reinit_error=True
        )  # otherwise fails sometimes locally
        # registration
        ModelCatalog.register_custom_action_dist("test", CustomMultiActionDistribution)
        s1 = Discrete(5)
        s2 = Box(0, 1, shape=(3,), dtype=np.float32)
        spaces = dict(action_1=s1, action_2=s2)
        action_space = Dict(spaces)
        # test retrieving it
        model_config = MODEL_DEFAULTS.copy()
        model_config["custom_action_dist"] = "test"
        dist_cls, param_shape = ModelCatalog.get_action_dist(action_space, model_config)
        self.assertIsInstance(dist_cls, partial)
        self.assertEqual(param_shape, s1.n + 2 * s2.shape[0])

        # test the class works as a distribution
        dist_input = tf1.placeholder(tf.float32, (None, param_shape))
        model = Model()
        model.model_config = model_config
        dist = dist_cls(dist_input, model=model)
        self.assertIsInstance(dist.sample(), dict)
        self.assertIn("action_1", dist.sample())
        self.assertIn("action_2", dist.sample())
        self.assertEqual(dist.sample()["action_1"].dtype, tf.int64)
        self.assertEqual(dist.sample()["action_2"].shape[1:], s2.shape)

        with self.assertRaises(NotImplementedError):
            dist.entropy()
Example #2
    def testCustomActionDistribution(self):
        ray.init()
        # registration
        ModelCatalog.register_custom_action_dist("test",
                                                 CustomActionDistribution)
        action_space = Box(0, 1, shape=(5, 3), dtype=np.float32)

        # test retrieving it
        model_config = MODEL_DEFAULTS.copy()
        model_config["custom_action_dist"] = "test"
        dist_cls, param_shape = ModelCatalog.get_action_dist(
            action_space, model_config)
        self.assertEqual(str(dist_cls), str(CustomActionDistribution))
        self.assertEqual(param_shape, action_space.shape)

        # test the class works as a distribution
        dist_input = tf.placeholder(tf.float32, (None, ) + param_shape)
        dist = dist_cls(dist_input, model_config=model_config)
        self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
        self.assertIsInstance(dist.sample(), tf.Tensor)
        with self.assertRaises(NotImplementedError):
            dist.entropy()

        # test passing the options to it
        model_config["custom_options"].update({"output_dim": (3, )})
        dist_cls, param_shape = ModelCatalog.get_action_dist(
            action_space, model_config)
        self.assertEqual(param_shape, (3, ))
        dist_input = tf.placeholder(tf.float32, (None, ) + param_shape)
        dist = dist_cls(dist_input, model_config=model_config)
        self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
        self.assertIsInstance(dist.sample(), tf.Tensor)
        with self.assertRaises(NotImplementedError):
            dist.entropy()
Example #3
def initialize():
    ray.init()
    register_env("coverage", lambda config: CoverageEnv(config))
    register_env("path_planning", lambda config: PathPlanningEnv(config))
    ModelCatalog.register_custom_model("adversarial", AdversarialModel)
    ModelCatalog.register_custom_action_dist(
        "hom_multi_action", TorchHomogeneousMultiActionDistribution)
Example #4
def register_mixture_action_distribution():
    ModelCatalog.register_custom_action_dist(GaussianMixture.name,
                                             GaussianMixture)
    ModelCatalog.register_custom_action_dist(DeterministicMixture.name,
                                             DeterministicMixture)
    print("Successfully register GaussianMixture and DeterministicMixture "
          "action distribution.")
Example #5
    def __init__(self, load_path, algorithm, policy_name, observation_space, action_space):
        self._checkpoint_path = load_path
        self._policy_name = policy_name
        self._observation_space = observation_space
        self._action_space = action_space
        self._prep = ModelCatalog.get_preprocessor_for_space(self._observation_space)
        flat_obs_space = self._prep.observation_space

        ray.init(ignore_reinit_error=True, local_mode=True)

        from utils.ppo_policy import PPOTorchPolicy as LoadPolicy
        ModelCatalog.register_custom_action_dist("my_dist", TorchRepeatDistribution)
        config = ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG.copy()
        config['num_workers'] = 0
        config["model"]["use_lstm"] = True
        config['model']['free_log_std'] = False
        config["model"]["custom_action_dist"] =  "my_dist"

        self.policy = LoadPolicy(flat_obs_space, self._action_space, config)
        objs = pickle.load(open(self._checkpoint_path, "rb"))
        objs = pickle.loads(objs["worker"])
        state = objs["state"]
        filters = objs["filters"]
        self.filters = filters[self._policy_name]
        weights = state[self._policy_name]
        weights.pop("_optimizer_variables")
        self.policy.set_weights(weights)
        self.model = self.policy.model

        self.rnn_state = self.model.get_initial_state()
        self.rnn_state = [self.rnn_state[0].unsqueeze(0), self.rnn_state[1].unsqueeze(0)]
Example #6
def setup_ma_config(config, create_env):
    env = create_env(config['env_config'])
    policies_to_train = ['agent']

    num_adversaries = config['env_config']['num_adv_strengths'] * config['env_config']['advs_per_strength']
    if num_adversaries == 0:
        return
    adv_policies = ['adversary' + str(i) for i in range(num_adversaries)]
    adversary_config = {"model": {'fcnet_hiddens': [64, 64], 'use_lstm': False}, "entropy_coeff": config['env_config']['entropy_coeff']}
    if config['env_config']['run'] == 'PPO':
        if config['env_config']['kl_reward']:
            ModelCatalog.register_custom_action_dist("logits_dist", LogitsDist)
            adversary_config['model']['custom_action_dist'] = "logits_dist"
        # for both of these we need a graph that zeros out agents that weren't active
        if config['env_config']['kl_reward'] or (config['env_config']['l2_reward'] and not config['env_config']['l2_memory']):
            policy_graphs = {'agent': (PPOTFPolicy, env.observation_space, env.action_space, {})}
            policy_graphs.update({adv_policies[i]: (CustomPPOPolicy, env.adv_observation_space,
                                                    env.adv_action_space, adversary_config) for i in
                                  range(num_adversaries)})
        else:
            policy_graphs = {'agent': (PPOTFPolicy, env.observation_space, env.action_space, {})}
            policy_graphs.update({adv_policies[i]: (PPOTFPolicy, env.adv_observation_space,
                                                    env.adv_action_space, adversary_config) for i in range(num_adversaries)})
    elif config['env_config']['run'] == 'TD3':
        policy_graphs = {'agent': (DDPGTFPolicy, env.observation_space, env.action_space, {})}
        policy_graphs.update({adv_policies[i]: (DDPGTFPolicy, env.adv_observation_space,
                                                env.adv_action_space, adversary_config) for i in range(num_adversaries)})
    
    # policy_graphs.update({adv_policies[i]: (CustomPPOPolicy, env.adv_observation_space,
    #                                         env.adv_action_space, adversary_config) for i in range(num_adversaries)})

    print("========= Policy Graphs ==========")
    print(policy_graphs)

    policies_to_train += adv_policies

    def policy_mapping_fn(agent_id):
        return agent_id

    config.update({
        'multiagent': {
            'policies': policy_graphs,
            'policy_mapping_fn': policy_mapping_fn,
            'policies_to_train': policies_to_train
        }
    })
    print({'multiagent': {
            'policies': policy_graphs,
            'policy_mapping_fn': policy_mapping_fn,
            'policies_to_train': policies_to_train
        }})
Example #7
    def test_custom_action_distribution(self):
        class Model():
            pass

        ray.init(object_store_memory=1000 * 1024 * 1024,
                 ignore_reinit_error=True)  # otherwise fails sometimes locally
        # registration
        ModelCatalog.register_custom_action_dist("test",
                                                 CustomActionDistribution)
        action_space = Box(0, 1, shape=(5, 3), dtype=np.float32)

        # test retrieving it
        model_config = MODEL_DEFAULTS.copy()
        model_config["custom_action_dist"] = "test"
        dist_cls, param_shape = ModelCatalog.get_action_dist(
            action_space, model_config)
        self.assertEqual(str(dist_cls), str(CustomActionDistribution))
        self.assertEqual(param_shape, action_space.shape)

        # test the class works as a distribution
        dist_input = tf1.placeholder(tf.float32, (None, ) + param_shape)
        model = Model()
        model.model_config = model_config
        dist = dist_cls(dist_input, model=model)
        self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
        self.assertIsInstance(dist.sample(), tf.Tensor)
        with self.assertRaises(NotImplementedError):
            dist.entropy()

        # test passing the options to it
        model_config["custom_model_config"].update({"output_dim": (3, )})
        dist_cls, param_shape = ModelCatalog.get_action_dist(
            action_space, model_config)
        self.assertEqual(param_shape, (3, ))
        dist_input = tf1.placeholder(tf.float32, (None, ) + param_shape)
        model.model_config = model_config
        dist = dist_cls(dist_input, model=model)
        self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
        self.assertIsInstance(dist.sample(), tf.Tensor)
        with self.assertRaises(NotImplementedError):
            dist.entropy()
Example #8
def persuasive_a3c_conf(rollout_size=10,
                        agents=100,
                        debug_folder=None,
                        eval_folder=None,
                        alpha=0.0001,
                        gamma=0.99):
    """
        https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/trainer.py#L44
        https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/a3c/a3c.py#L14
        https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/models/catalog.py#L37
    """

    ModelCatalog.register_custom_model('custom_rrn', RNNModel)
    ModelCatalog.register_custom_action_dist(
        "custom_action_distribution", PersuasiveActionDistribution)

    custom_configuration = DEFAULT_CONFIG.copy()  # copy so the shared DEFAULT_CONFIG is not mutated

    custom_configuration['collect_metrics_timeout'] = 86400 # a day
    custom_configuration['framework'] = 'tf'
    custom_configuration['ignore_worker_failures'] = True
    custom_configuration['log_level'] = 'WARN'
    custom_configuration['monitor'] = True
    custom_configuration['num_cpus_for_driver'] = 1
    custom_configuration['num_cpus_per_worker'] = 1
    custom_configuration['num_envs_per_worker'] = 1
    custom_configuration['num_gpus_per_worker'] = 1
    custom_configuration['num_gpus'] = 1
    custom_configuration['num_workers'] = 1
    custom_configuration['output'] = debug_folder
    custom_configuration['remote_env_batch_wait_ms'] = 1000
    custom_configuration['remote_worker_envs'] = False
    custom_configuration['seed'] = 42
    custom_configuration['timesteps_per_iteration'] = 1

    # === Environment Settings ===
    custom_configuration['batch_mode'] = 'complete_episodes'
    custom_configuration['gamma'] = gamma
    custom_configuration['lr'] = alpha
    custom_configuration['no_done_at_end'] = False
    # Divide episodes into fragments of this many steps each during rollouts.
    # Sample batches of this size are collected from rollout workers and
    # combined into a larger batch of `train_batch_size` for learning.
    # For example, given rollout_fragment_length=100 and train_batch_size=1000:
    #   1. RLlib collects 10 fragments of 100 steps each from rollout workers.
    #   2. These fragments are concatenated and we perform an epoch of SGD.
    # When using multiple envs per worker, the fragment size is multiplied by
    # `num_envs_per_worker`. This is since we are collecting steps from
    # multiple envs in parallel. For example, if num_envs_per_worker=5, then
    # rollout workers will return experiences in chunks of 5*100 = 500 steps.
    # The dataflow here can vary per algorithm. For example, PPO further
    # divides the train batch into minibatches for multi-epoch SGD.
    custom_configuration['rollout_fragment_length'] = rollout_size
    # Training batch size, if applicable. Should be >= rollout_fragment_length.
    # Samples batches will be concatenated together to a batch of this size,
    # which is then passed to SGD.
    custom_configuration['train_batch_size'] = rollout_size * agents

    # === Exploration Settings ===
    # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/stochastic_sampling.py
    # custom_configuration['exploration_config']['type'] = 'StochasticSampling'

    # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/epsilon_greedy.py
    custom_configuration['exploration_config']['type'] = 'EpsilonGreedy'
    custom_configuration['exploration_config']['initial_epsilon'] = 1.0
    custom_configuration['exploration_config']['final_epsilon'] = 0.0001

    # ==================== MODEL - DEFAULT ====================
    # custom_configuration['model']['fcnet_hiddens'] = [64, 64]
    # === Built-in options ===
    # Filter config. List of [out_channels, kernel, stride] for each filter
    # custom_configuration['model']['conv_filters'] = None
    # Nonlinearity for built-in convnet
    # custom_configuration['model']['conv_activation'] = "relu"
    # Nonlinearity for fully connected net (tanh, relu)
    # custom_configuration['model']['fcnet_activation'] = "tanh"
    # Number of hidden layers for fully connected net
    # custom_configuration['model']['fcnet_hiddens'] = [64, 64]
    # For DiagGaussian action distributions, make the second half of the model
    # outputs floating bias variables instead of state-dependent. This only
    # has an effect if using the default fully connected net.
    # custom_configuration['model']['free_log_std'] = False
    # Whether to skip the final linear layer used to resize the hidden layer
    # outputs to size `num_outputs`. If True, then the last hidden layer
    # should already match num_outputs.
    # custom_configuration['model']['no_final_linear'] = False
    # Whether layers should be shared for the value function.
    # custom_configuration['model']['vf_share_layers'] = True

    # == LSTM ==
    # Whether to wrap the model with an LSTM.
    # custom_configuration['model']['use_lstm'] = False
    # Max seq len for training the LSTM, defaults to 20.
    # custom_configuration['model']['max_seq_len'] = 20
    # Size of the LSTM cell.
    # custom_configuration['model']['lstm_cell_size'] = 64
    # Whether to feed a_{t-1}, r_{t-1} to LSTM.
    # custom_configuration['model']['lstm_use_prev_action_reward'] = False
    # When using modelv1 models with a modelv2 algorithm, you may have to
    # define the state shape here (e.g., [256, 256]).
    # custom_configuration['model']['state_shape'] = None # [64, 64]

    # == Atari ==
    # Whether to enable framestack for Atari envs
    # custom_configuration['model']['framestack'] = True
    # Final resized frame dimension
    # custom_configuration['model']['dim'] = 84
    # (deprecated) Converts ATARI frame to 1 Channel Grayscale image
    # custom_configuration['model']['grayscale'] = False
    # (deprecated) Changes frame to range from [-1, 1] if true
    # custom_configuration['model']['zero_mean'] = True

    # === Options for custom models ===
    # Name of a custom model to use
    custom_configuration['model']['custom_model'] = 'custom_rrn'
    # Extra options to pass to the custom classes.
    # These will be available in the Model's
    custom_configuration['model']['custom_model_config'] = {}
    # Name of a custom action distribution to use.
    # See: https://docs.ray.io/en/releases-1.0.0/rllib-models.html#custom-action-distributions
    custom_configuration['model']['custom_action_dist'] = 'custom_action_distribution'

    # == OPTIMIZER ==
    # Arguments to pass to the policy optimizer. These vary by optimizer.
    # custom_configuration['optimizer'] = {}

    # == Persuasive A3C ==
    custom_configuration['callbacks'] = PersuasiveCallbacks
    custom_configuration['min_iter_time_s'] = 5

    custom_configuration['use_gae'] = True

    # === Evaluation Settings ===
    # Evaluate with every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    custom_configuration['evaluation_interval'] = 5

    # Number of episodes to run per evaluation period. If using multiple
    # evaluation workers, we will run at least this many episodes total.
    custom_configuration['evaluation_num_episodes'] = 5

    # Internal flag that is set to True for evaluation workers.
    # DEFAULT: 'in_evaluation': False,

    # Typical usage is to pass extra args to evaluation env creator
    # and to disable exploration by computing deterministic actions.
    # IMPORTANT NOTE: Policy gradient algorithms are able to find the optimal
    # policy, even if this is a stochastic one. Setting 'explore=False' here
    # will result in the evaluation workers not using this optimal policy!
    custom_configuration['evaluation_config']['explore'] = False
    custom_configuration['evaluation_config']['lr'] = 0
    custom_configuration['evaluation_config']['num_gpus_per_worker'] = 0
    custom_configuration['evaluation_config']['num_gpus'] = 0
    custom_configuration['evaluation_config']['output'] = eval_folder
    # custom_configuration['evaluation_config']['env_config'] = {...},

    # Number of parallel workers to use for evaluation. Note that this is set
    # to zero by default, which means evaluation will be run in the trainer
    # process. If you increase this, it will increase the Ray resource usage
    # of the trainer since evaluation workers are created separately from
    # rollout workers.
    custom_configuration['evaluation_num_workers'] = 1

    # Customize the evaluation method. This must be a function of signature
    # (trainer: Trainer, eval_workers: WorkerSet) -> metrics: dict. See the
    # Trainer._evaluate() method to see the default implementation. The
    # trainer guarantees all eval workers have the latest policy state before
    # this function is called.
    custom_configuration['custom_eval_function'] = None #custom_eval_function

    return custom_configuration
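For concreteness, with the defaults above (rollout_size=10, agents=100) the returned config has rollout_fragment_length = 10 and train_batch_size = 10 * 100 = 1000, so each training batch is assembled from 100 rollout fragments, matching the fragment/batch relationship described in the comments.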
Example #9
#             "shuffle_sequences": True,
#             "num_sgd_iter": 15,
#             "lr": 5e-5,
#             "vf_share_layers": True,
#             "vf_loss_coeff": 0.5,
#             "entropy_coeff": 0.001,
#             "entropy_coeff_schedule": None,
#             "clip_param": 0.2,
#             "kl_target": 0.01,
#             "grad_clip": 5.0,
#             "gamma": 0.999,
#             "sample_batch_size": 128,
#             "train_batch_size": 1024
#             }
# config.update(ppo_conf)
ModelCatalog.register_custom_action_dist("my_dist", MultiOrdinal)

config["num_gpus"] = args.ngpu  # used for trainer process
config["num_workers"] = args.ncpu
config['num_envs_per_worker'] = 1
config['env_config'] = envconf
config['horizon'] = envconf['max_steps']
config['model'] = {
    "custom_model": 'deepdrug3d',
    'custom_action_dist': 'my_dist'
}
# trainer = ppo.PPOTrainer(config=config, env='lactamase_docking')
# # trainer.restore('/homes/aclyde11/ray_results/PPO_lactamase_docking_2019-11-22_16-34-28igjfjjyh/checkpoint_1052/checkpoint-1052')
# policy = trainer.get_policy()
# print(policy.model.base_model.summary())
#
Example #10
def persuasive_ppo_conf(rollout_size=10,
                        agents=100,
                        true_obs_shape=None,
                        action_embed_size=None,
                        debug_folder=None,
                        eval_folder=None,
                        alpha=5e-5,
                        gamma=0.99):
    """
        https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/trainer.py#L44
        https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/ppo/ppo.py#L15
        https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/models/catalog.py#L37
    """

    ModelCatalog.register_custom_action_dist('custom_action_distribution',
                                             PersuasiveActionDistribution)

    ModelCatalog.register_custom_model('custom_model',
                                       OwnershipActionMaskingModel)

    custom_configuration = ppo.DEFAULT_CONFIG.copy()

    # custom_configuration['collect_metrics_timeout'] = 86400 # a day
    custom_configuration['framework'] = 'tf'
    custom_configuration['ignore_worker_failures'] = True
    custom_configuration['log_level'] = 'WARN'
    custom_configuration['monitor'] = True
    custom_configuration['num_gpus'] = 0
    custom_configuration['num_cpus_for_driver'] = 1
    custom_configuration['num_cpus_per_worker'] = 1
    custom_configuration['num_envs_per_worker'] = 1
    # custom_configuration['output'] = debug_folder
    # custom_configuration['remote_env_batch_wait_ms'] = 1000
    # custom_configuration['remote_worker_envs'] = False
    custom_configuration['seed'] = 42

    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    custom_configuration['num_workers'] = 1
    custom_configuration['num_gpus_per_worker'] = 0
    # Prevent iterations from going lower than this time span
    # custom_configuration['min_iter_time_s'] = 1

    # === Environment Settings ===
    custom_configuration['batch_mode'] = 'complete_episodes'
    custom_configuration['callbacks'] = PersuasiveCallbacks
    custom_configuration['gamma'] = gamma
    custom_configuration['lr'] = alpha
    custom_configuration['lr_schedule'] = None
    custom_configuration['no_done_at_end'] = False

    # === Exploration Settings ===
    # custom_configuration['exploration_config'] = {}

    # # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/epsilon_greedy.py
    # custom_configuration['exploration_config']['type'] = 'EpsilonGreedy'
    # custom_configuration['exploration_config']['initial_epsilon'] = 1.0
    # custom_configuration['exploration_config']['final_epsilon'] = 0.02
    # custom_configuration['exploration_config']['epsilon_timesteps'] = 10000

    # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/soft_q.py
    # custom_configuration['exploration_config']['type'] = 'SoftQ'
    # custom_configuration['exploration_config']['temperature'] = 1.0 # Default

    # Name of a custom action distribution to use.
    # See: https://docs.ray.io/en/releases-1.0.0/rllib-models.html#custom-action-distributions
    # custom_configuration['model']['custom_action_dist'] = 'custom_action_distribution'

    # === PPO Model Settings ===
    # Should use a critic as a baseline (otherwise don't use value baseline;
    # required for using GAE).
    custom_configuration['use_critic'] = True
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    custom_configuration['use_gae'] = True
    # The GAE(lambda) parameter.
    custom_configuration['lambda'] = 1.0
    # Initial coefficient for KL divergence.
    custom_configuration['kl_coeff'] = 0.2
    # Size of batches collected from each worker.
    custom_configuration['rollout_fragment_length'] = 10  #100
    # Number of timesteps collected for each SGD round. This defines the size
    # of each SGD epoch.
    custom_configuration['train_batch_size'] = 100  #1000
    # Total SGD batch size across all devices for SGD. This defines the
    # minibatch size within each epoch.
    custom_configuration['sgd_minibatch_size'] = 64  #128 #64
    # Whether to shuffle sequences in the batch when training (recommended).
    custom_configuration['shuffle_sequences'] = True
    # Number of SGD iterations in each outer loop (i.e., number of epochs to
    # execute per train batch).
    custom_configuration['num_sgd_iter'] = 30
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    custom_configuration['vf_share_layers'] = False
    # Coefficient of the value function loss. IMPORTANT: you must tune this if
    # you set vf_share_layers: True.
    custom_configuration['vf_loss_coeff'] = 1.0
    # Coefficient of the entropy regularizer.
    custom_configuration['entropy_coeff'] = 0.0
    # Decay schedule for the entropy regularizer.
    custom_configuration['entropy_coeff_schedule'] = None
    # PPO clip parameter.
    custom_configuration['clip_param'] = 0.3
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    custom_configuration['vf_clip_param'] = 10.0
    # If specified, clip the global norm of gradients by this amount.
    custom_configuration['grad_clip'] = None
    # Target value for KL divergence.
    custom_configuration['kl_target'] = 0.01
    # Which observation filter to apply to the observation.
    custom_configuration['observation_filter'] = "NoFilter"
    # Uses the sync samples optimizer instead of the multi-gpu one. This is
    # usually slower, but you might want to try it if you run into issues with
    # the default optimizer.
    custom_configuration['simple_optimizer'] = False
    # Whether to fake GPUs (using CPUs).
    # Set this to True for debugging on non-GPU machines (set `num_gpus` > 0).
    custom_configuration['_fake_gpus'] = False

    # === MODEL ===
    custom_configuration['model']['use_lstm'] = False
    custom_configuration['model']['custom_model'] = 'custom_model'
    custom_configuration['model']['custom_model_config'][
        'true_obs_shape'] = true_obs_shape
    custom_configuration['model']['custom_model_config'][
        'action_embed_size'] = action_embed_size

    # === Evaluation Settings ===
    # Evaluate with every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    custom_configuration['evaluation_interval'] = 1
    # Number of episodes to run per evaluation period. If using multiple
    # evaluation workers, we will run at least this many episodes total.
    custom_configuration['evaluation_num_episodes'] = 5
    # Internal flag that is set to True for evaluation workers.
    # DEFAULT: 'in_evaluation': False,
    # Typical usage is to pass extra args to evaluation env creator
    # and to disable exploration by computing deterministic actions.
    # IMPORTANT NOTE: Policy gradient algorithms are able to find the optimal
    # policy, even if this is a stochastic one. Setting 'explore=False' here
    # will result in the evaluation workers not using this optimal policy!
    custom_configuration['evaluation_config']['explore'] = True
    custom_configuration['evaluation_config']['lr'] = 0
    custom_configuration['evaluation_config']['num_gpus_per_worker'] = 0
    custom_configuration['evaluation_config']['num_gpus'] = 0
    # custom_configuration['evaluation_config']['output'] = eval_folder
    # custom_configuration['evaluation_config']['env_config'] = {...},
    # Number of parallel workers to use for evaluation. Note that this is set
    # to zero by default, which means evaluation will be run in the trainer
    # process. If you increase this, it will increase the Ray resource usage
    # of the trainer since evaluation workers are created separately from
    # rollout workers.
    custom_configuration['evaluation_num_workers'] = 1
    # Customize the evaluation method. This must be a function of signature
    # (trainer: Trainer, eval_workers: WorkerSet) -> metrics: dict. See the
    # Trainer._evaluate() method to see the default implementation. The
    # trainer guarantees all eval workers have the latest policy state before
    # this function is called.
    custom_configuration['custom_eval_function'] = None  #custom_eval_function

    return custom_configuration
Example #11
                         "receiver")


class DeterministicSenderDist(DeterministicMessageActionDistribution):
    def __init__(self, inputs, model):
        super().__init__(inputs, model, sample_env.action_space["sender"],
                         "sender")


class DeterministicReceiverDist(DeterministicMessageActionDistribution):
    def __init__(self, inputs, model):
        super().__init__(inputs, model, sample_env.action_space["receiver"],
                         "receiver")


ModelCatalog.register_custom_action_dist("sender_dist", SenderDist)
ModelCatalog.register_custom_action_dist("receiver_dist", ReceiverDist)
ModelCatalog.register_custom_action_dist("deterministic_sender_dist",
                                         DeterministicSenderDist)
ModelCatalog.register_custom_action_dist("deterministic_receiver_dist",
                                         DeterministicReceiverDist)

configs = []

default_config = {
    # "run": "IMPALA",
    "run":
    ImpalaCPCSaTrainer,
    "tot_steps":
    1e9,
    "max_workers":
Example #12
)

parser = argparse.ArgumentParser()
parser.add_argument('--run', type=str, default='PG')  # A3C
parser.add_argument('--stop', type=int, default=180)

if __name__ == '__main__':
    args = parser.parse_args()
    ray.init()
    register_env('cart_poles_env', lambda env_config: CartPolesEnv(env_config))
    register_env('cart_poles_stacked_env',
                 lambda env_config: CartPolesStackedEnv(env_config))
    ModelCatalog.register_custom_model('cart_poles_model', CartPolesModel)
    ModelCatalog.register_custom_model('cart_poles_stacked_model',
                                       CartPolesStackedModel)
    ModelCatalog.register_custom_action_dist('cart_poles_action_dist',
                                             CartPolesActionDist)

    tune.run(
        args.run,
        stop={'episode_reward_mean': args.stop},
        config={
            'env': 'cart_poles_stacked_env',
            # 'gamma': 0.99,
            'num_workers': 3,
            'model': {
                'custom_model': 'cart_poles_stacked_model',
                'custom_action_dist': 'cart_poles_action_dist',
            },
        })
Example #13
def main(args):
    # ====================================
    # init env config
    # ====================================
    if args.no_debug:
        ray.init(webui_host="127.0.0.1")
    else:
        ray.init(local_mode=True, webui_host="127.0.0.1")
    # use ray cluster for training
    # ray.init(
    #     address="auto" if args.address is None else args.address,
    #     redis_password="******",
    # )
    #
    # print(
    #     "--------------- Ray startup ------------\n{}".format(
    #         ray.state.cluster_resources()
    #     )
    # )

    agent_specs = {"AGENT-007": agent_spec}

    env_config = {
        "seed": 42,
        "scenarios": [scenario_paths],
        "headless": args.headless,
        "agent_specs": agent_specs,
    }

    # ====================================
    # init tune config
    # ====================================
    class MultiEnv(RLlibHiWayEnv):
        def __init__(self, env_config):
            env_config["scenarios"] = [
                scenario_paths[(env_config.worker_index - 1) %
                               len(scenario_paths)]
            ]
            super(MultiEnv, self).__init__(config=env_config)

        def step(self, agent_actions):
            for agent_id in agent_actions:
                repeat = int(agent_actions[agent_id][-1])
            agent_actions = {
                agent_id: actions[:-1]
                for agent_id, actions in agent_actions.items()
            }

            if self.pre_act is not None and repeat > 0:
                obs, r, done, info = super().step(self.pre_act)
            else:
                obs, r, done, info = super().step(agent_actions)

            self.pre_act = agent_actions

            return obs, r, done, info

        def reset(self):
            self.pre_act = None
            return super().reset()

    ModelCatalog.register_custom_model("my_fc", FullyConnectedNetwork)
    ModelCatalog.register_custom_action_dist("my_dist", TorchDyDistribution)
    tune_config = {
        "env": MultiEnv,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "default_policy": (
                    None,
                    OBSERVATION_SPACE,
                    ACTION_SPACE,
                    {},
                )
            },
            "policy_mapping_fn": lambda agent_id: "default_policy",
        },
        "model": {
            "custom_model": "my_fc",
            "custom_action_dist": "my_dist",
        },
        "framework": "torch",
        "callbacks": {
            "on_episode_start": on_episode_start,
            "on_episode_step": on_episode_step,
            "on_episode_end": on_episode_end,
        },
        "lr": 1e-4,
        "log_level": "WARN",
        "num_workers": args.num_workers,
        "horizon": args.horizon,
        "train_batch_size": 5120 * 3,

        # "observation_filter": "MeanStdFilter",
        # "batch_mode": "complete_episodes",
        # "grad_clip": 0.5,

        # "model":{
        #     "use_lstm": True,
        # },
    }

    tune_config.update({
        "lambda": 0.95,
        "clip_param": 0.2,
        "num_sgd_iter": 10,
        "sgd_minibatch_size": 512,
        "gamma": 0.995,
        "seed_global": tune.grid_search([10, 20, 30, 40])
    })

    # ====================================
    # init log and checkpoint dir_info
    # ====================================
    experiment_name = EXPERIMENT_NAME.format(
        scenario=args.exper,
        algorithm="PPO",
        n_agent=1,
    )

    log_dir = Path(args.log_dir).expanduser().absolute() / RUN_NAME
    log_dir.mkdir(parents=True, exist_ok=True)
    print(f"Checkpointing at {log_dir}")

    if args.restore:
        restore_path = Path(args.restore).expanduser()
        print(f"Loading model from {restore_path}")
    else:
        restore_path = None

    # run experiments
    analysis = tune.run(
        PPOTrainer,
        # "PPO",
        name=experiment_name,
        stop={"timesteps_total": 10000000},
        checkpoint_freq=20,
        checkpoint_at_end=True,
        local_dir=str(log_dir),
        resume=args.resume,
        restore=restore_path,
        max_failures=1000,
        export_formats=["model", "checkpoint"],
        config=tune_config,
    )

    print(analysis.dataframe().head())
Example #14
    def setup(self, cfg: DictConfig):
        """
        This method initializes and registers all necessary maze components with RLlib

        :param cfg: Full Hydra run job config
        """
        # Generate a random state used for sampling random seeds for the envs and agents
        self.maze_seeding = MazeSeeding(cfg.seeding.env_base_seed,
                                        cfg.seeding.agent_base_seed,
                                        cfg.seeding.cudnn_determinism_flag)

        self._cfg = cfg

        # Initialize env factory (with rllib monkey patches)
        self.env_factory = build_maze_rllib_env_factory(cfg)

        # Register maze env factory with rllib
        tune.register_env("maze_env", lambda x: self.env_factory())

        # Register maze model and distribution mapper if a maze model should be used
        # Check whether we are using the rllib default model composer or a maze model
        using_rllib_model_composer = '_target_' not in cfg.model.keys()
        if not using_rllib_model_composer:
            # Get model class
            model_cls = Factory(MazeRLlibBaseModel).type_from_name(
                cfg.algorithm.model_cls)
            # Register maze model
            ModelCatalog.register_custom_model("maze_model", model_cls)

            if 'policy' in cfg.model and "networks" in cfg.model.policy:
                assert len(cfg.model.policy.networks
                           ) == 1, 'Hierarchical envs are not yet supported'

            # register maze action distribution
            ModelCatalog.register_custom_action_dist(
                'maze_dist', MazeRLlibActionDistribution)
            model_config = {
                "custom_action_dist": 'maze_dist',
                "custom_model": "maze_model",
                "vf_share_layers": False,
                "custom_model_config": {
                    "maze_model_composer_config": cfg.model,
                    'spaces_config_dump_file': self.spaces_config_dump_file,
                    'state_dict_dump_file': self.state_dict_dump_file
                }
            }
        else:
            # If specified use the default rllib model builder
            model_config = OmegaConf.to_container(cfg.model, resolve=True)

        # Build rllib config
        maze_rllib_config = {
            "env": "maze_env",
            # Store env config for possible later use
            "env_config": {
                'env': cfg.env,
                'wrappers': cfg.wrappers
            },
            "model": model_config,
            'callbacks': MazeRLlibLoggingCallbacks,
            "framework": "torch"
        }
        # Load the algorithm config and update the custom parameters
        rllib_config: Dict = OmegaConf.to_container(cfg.algorithm.config,
                                                    resolve=True)
        assert 'model' not in rllib_config, 'The config should be removed from the default yaml files since it will ' \
                                            'be dynamically written'
        assert self.num_workers == rllib_config['num_workers']
        rllib_config.update(maze_rllib_config)

        if rllib_config['seed'] is None:
            rllib_config[
                'seed'] = self.maze_seeding.generate_env_instance_seed()

        # Initialize ray with the passed ray_config parameters
        ray_config: Dict = OmegaConf.to_container(self.ray_config,
                                                  resolve=True)

        # Load tune parameters
        tune_config = OmegaConf.to_container(self.tune_params, resolve=True)
        tune_config['callbacks'] = [MazeRLlibSaveModelCallback()]

        # Start tune experiment
        assert 'config' not in tune_config, 'The config should be removed from the default yaml files since it will ' \
                                            'be dynamically written'

        self.ray_config = ray_config
        self.rllib_config = rllib_config
        self.tune_config = tune_config
Example #15
def persuasive_dqn_conf(rollout_size=10,
                        agents=100,
                        debug_folder=None,
                        eval_folder=None,
                        alpha=5e-4,
                        gamma=0.99):
    """
        https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/trainer.py#L44
        https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/dqn/dqn.py#L21
        https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/models/catalog.py#L37
    """

    # ModelCatalog.register_custom_model('custom_rrn', RNNModel)
    ModelCatalog.register_custom_action_dist('custom_action_distribution',
                                             PersuasiveActionDistribution)

    custom_configuration = dqn.DEFAULT_CONFIG.copy()

    custom_configuration['collect_metrics_timeout'] = 86400  # a day
    custom_configuration['framework'] = 'tf'
    custom_configuration['ignore_worker_failures'] = True
    custom_configuration['log_level'] = 'WARN'
    custom_configuration['monitor'] = True
    custom_configuration['num_cpus_for_driver'] = 1
    custom_configuration['num_cpus_per_worker'] = 1
    custom_configuration['num_envs_per_worker'] = 1
    custom_configuration['output'] = debug_folder
    custom_configuration['remote_env_batch_wait_ms'] = 1000
    custom_configuration['remote_worker_envs'] = False
    custom_configuration['seed'] = 42

    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you"re using the Async or Ape-X optimizers.
    custom_configuration['num_workers'] = 0
    custom_configuration['num_gpus_per_worker'] = 1
    # Whether to compute priorities on workers.
    custom_configuration['worker_side_prioritization'] = False
    # Prevent iterations from going lower than this time span
    custom_configuration['min_iter_time_s'] = 1

    # === Environment Settings ===
    custom_configuration['batch_mode'] = 'complete_episodes'
    custom_configuration['callbacks'] = PersuasiveCallbacks
    custom_configuration['gamma'] = gamma
    custom_configuration['lr'] = alpha
    custom_configuration['no_done_at_end'] = False
    # Update the replay buffer with this many samples at once. Note that
    # this setting applies per-worker if num_workers > 1.
    custom_configuration['rollout_fragment_length'] = rollout_size
    # Size of a batch sampled from replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    custom_configuration['train_batch_size'] = rollout_size * agents
    # If positive, input batches will be shuffled via a sliding window buffer
    # of this number of batches. Use this if the input data is not in random
    # enough order. Input is delayed until the shuffle buffer is filled.
    custom_configuration['shuffle_buffer_size'] = rollout_size * agents
    # Minimum env steps to optimize for per train call. This value does
    # not affect learning, only the length of train iterations.
    custom_configuration['timesteps_per_iteration'] = agents
    # How many steps of the model to sample before learning starts.
    custom_configuration['learning_starts'] = rollout_size * agents

    # === Exploration Settings ===
    custom_configuration['exploration_config'] = {}

    # # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/epsilon_greedy.py
    # custom_configuration['exploration_config']['type'] = 'EpsilonGreedy'
    # custom_configuration['exploration_config']['initial_epsilon'] = 1.0
    # custom_configuration['exploration_config']['final_epsilon'] = 0.02
    # custom_configuration['exploration_config']['epsilon_timesteps'] = 10000

    # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/soft_q.py
    custom_configuration['exploration_config']['type'] = 'SoftQ'
    custom_configuration['exploration_config']['temperature'] = 1.0  # Default

    # Name of a custom action distribution to use.
    # See: https://docs.ray.io/en/releases-1.0.0/rllib-models.html#custom-action-distributions
    # custom_configuration['model']['custom_action_dist'] = 'custom_action_distribution'

    # === DQN Model Settings ===
    # Update the target network every `target_network_update_freq` steps.
    # custom_configuration['target_network_update_freq'] = rollout_size
    custom_configuration['target_network_update_freq'] = agents
    # every agent should have done at least 1 action
    custom_configuration['n_step'] = 10

    # Number of atoms for representing the distribution of return. When
    # this is greater than 1, distributional Q-learning is used.
    # the discrete supports are bounded by v_min and v_max
    custom_configuration['num_atoms'] = 1
    custom_configuration['v_min'] = -10.0
    custom_configuration['v_max'] = 10.0
    # Whether to use noisy network
    custom_configuration['noisy'] = False
    # control the initial value of noisy nets
    custom_configuration['sigma0'] = 0.5
    # Whether to use dueling dqn
    custom_configuration['dueling'] = False  # True
    # Dense-layer setup for each the advantage branch and the value branch
    # in a dueling architecture.
    custom_configuration['hiddens'] = [256]
    # Whether to use double dqn
    custom_configuration['double_q'] = False  # True

    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    custom_configuration['buffer_size'] = 10000  # 50000
    # If True prioritized replay buffer will be used.
    custom_configuration['prioritized_replay'] = False
    # Alpha parameter for prioritized replay buffer.
    custom_configuration['prioritized_replay_alpha'] = 0.6
    # Beta parameter for sampling from prioritized replay buffer.
    custom_configuration['prioritized_replay_beta'] = 0.4
    # Final value of beta (by default, we use constant beta=0.4).
    custom_configuration['final_prioritized_replay_beta'] = 0.4
    # Time steps over which the beta parameter is annealed.
    custom_configuration['prioritized_replay_beta_annealing_timesteps'] = 20000
    # Epsilon to add to the TD errors when updating priorities.
    custom_configuration['prioritized_replay_eps'] = 1e-6
    # Whether to LZ4 compress observations
    custom_configuration['compress_observations'] = False
    # Callback to run before learning on a multi-agent batch of experiences.
    custom_configuration['before_learn_on_batch'] = None
    # If set, this will fix the ratio of replayed from a buffer and learned on
    # timesteps to sampled from an environment and stored in the replay buffer
    # timesteps. Otherwise, the replay will proceed at the native ratio
    # determined by (train_batch_size / rollout_fragment_length).
    custom_configuration['training_intensity'] = None

    # === Optimization ===
    # Adam epsilon hyper parameter
    custom_configuration['adam_epsilon'] = 1e-8
    # If not None, clip gradients during optimization at this value
    custom_configuration['grad_clip'] = 40

    # === Evaluation Settings ===
    # Evaluate with every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    custom_configuration['evaluation_interval'] = 1
    # Number of episodes to run per evaluation period. If using multiple
    # evaluation workers, we will run at least this many episodes total.
    custom_configuration['evaluation_num_episodes'] = 5
    # Internal flag that is set to True for evaluation workers.
    # DEFAULT: 'in_evaluation': False,
    # Typical usage is to pass extra args to evaluation env creator
    # and to disable exploration by computing deterministic actions.
    # IMPORTANT NOTE: Policy gradient algorithms are able to find the optimal
    # policy, even if this is a stochastic one. Setting 'explore=False' here
    # will result in the evaluation workers not using this optimal policy!
    custom_configuration['evaluation_config']['explore'] = False
    custom_configuration['evaluation_config']['lr'] = 0
    custom_configuration['evaluation_config']['num_gpus_per_worker'] = 0
    custom_configuration['evaluation_config']['num_gpus'] = 0
    custom_configuration['evaluation_config']['output'] = eval_folder
    # custom_configuration['evaluation_config']['env_config'] = {...},
    # Number of parallel workers to use for evaluation. Note that this is set
    # to zero by default, which means evaluation will be run in the trainer
    # process. If you increase this, it will increase the Ray resource usage
    # of the trainer since evaluation workers are created separately from
    # rollout workers.
    custom_configuration['evaluation_num_workers'] = 1
    # Customize the evaluation method. This must be a function of signature
    # (trainer: Trainer, eval_workers: WorkerSet) -> metrics: dict. See the
    # Trainer._evaluate() method to see the default implementation. The
    # trainer guarantees all eval workers have the latest policy state before
    # this function is called.
    custom_configuration['custom_eval_function'] = None  #custom_eval_function

    return custom_configuration
Example #16
        high_obs = np.concatenate([high_obs, high_obs_fill])
        observation_space_multi = gym.spaces.Box(low=low_obs,
                                                 high=high_obs,
                                                 dtype=np.float32)
        action_space_multi = gym.spaces.Discrete(15)

    # Register env's and custom stuff to RLlib for trainer to be able to use them.
    register_env(
        "BlueSkySrv", lambda env_config: BlueSkyServerMultiAgent(
            action_space_multi, observation_space_multi, settings.
            max_concurrent, env_config))
    ModelCatalog.register_custom_model("Centralized", MyModelCentralized)
    ModelCatalog.register_custom_model("CentralizedLSTM", MyModelCentralized2)
    # ModelCatalog.register_custom_action_dist("BetaDistributionAction", BetaDistributionAction)
    # ModelCatalog.register_custom_action_dist("CategoricalOrdinal", CategoricalOrdinal)
    ModelCatalog.register_custom_action_dist("CategoricalOrdinalTFP",
                                             CategoricalOrdinalTFP)

    # Init ray.
    ray.init()

    # def explore(config):
    #     # ensure we collect enough timesteps to do sgd
    #     if config["train_batch_size"] < config["sgd_minibatch_size"] * 2:
    #         config["train_batch_size"] = config["sgd_minibatch_size"] * 2
    #     # ensure we run at least one sgd iter
    #     if config["num_sgd_iter"] < 1:
    #         config["num_sgd_iter"] = 1
    #     return config
    #
    # pbt = PopulationBasedTraining(
    #     time_attr="time_total_s",
Example #17
        )  # batch_size x num_gaussians x action_dim
        cat_samples = self.cat.sample()  # batch_size
        # First we need to expand cat so that it has the same dimension as normal samples
        cat_samples = cat_samples.view(-1, 1,
                                       1).expand(-1, -1, self.action_dim)
        # We select the normal distribution based on the outputs of
        # the categorical distribution
        self.last_sample = torch.gather(normal_samples, 1,
                                        cat_samples).squeeze(
                                            dim=1)  # batch_size x action_dim
        assert len(
            self.last_sample.shape) == 2, f"shape, {self.last_sample.shape}"
        return self.last_sample


ModelCatalog.register_custom_action_dist("gmm",
                                         TorchGaussianMixtureDistribution)


class TorchFlowDistribution(TorchDistributionWrapper):
    # https://github.com/ray-project/ray/blob/be62444bc5924c61d69bb6aec62f967e531e768c/rllib/examples/models/autoregressive_action_dist.py
    @staticmethod
    def required_model_output_shape(action_space, model_config):
        return prod(action_space.shape)

    def __init__(self, inputs: torch.Tensor, model: NormalizingFlowsPolicy):
        super(TorchDistributionWrapper, self).__init__(inputs, model)
        self.model = model
        self.batch_size, self.action_dim = inputs.shape
        self.device = inputs.device
        self.base_dist = Normal(
            torch.zeros(self.batch_size, self.action_dim, device=self.device),
Example #18
                                           [a1_logits, a2_logits])
        self.action_model.summary()
        self.register_variables(self.action_model.variables)

    def forward(self, input_dict, state, seq_lens):
        context, self._value_out = self.base_model(input_dict["obs"])
        return context, state

    def value_function(self):
        return tf.reshape(self._value_out, [-1])


if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    ModelCatalog.register_custom_model("autoregressive_model",
                                       AutoregressiveActionsModel)
    ModelCatalog.register_custom_action_dist("binary_autoreg_output",
                                             BinaryAutoregressiveOutput)
    tune.run(args.run,
             stop={"episode_reward_mean": args.stop},
             config={
                 "env": CorrelatedActionsEnv,
                 "gamma": 0.5,
                 "num_gpus": 0,
                 "model": {
                     "custom_model": "autoregressive_model",
                     "custom_action_dist": "binary_autoreg_output",
                 },
             })
Example #19
        t1 = q.concentration1.lgamma() + q.concentration0.lgamma() + (
            sum_params_p).lgamma()
        t2 = p.concentration1.lgamma() + p.concentration0.lgamma() + (
            sum_params_q).lgamma()
        t3 = (p.concentration1 - q.concentration1) * torch.digamma(
            p.concentration1)
        t4 = (p.concentration0 - q.concentration0) * torch.digamma(
            p.concentration0)
        t5 = (sum_params_q - sum_params_p) * torch.digamma(sum_params_p)
        return (t1 - t2 + t3 + t4 + t5).sum(-1)

    def entropy(self):
        return self.dist.entropy().sum(-1)


ModelCatalog.register_custom_action_dist("mydist", MyDist)


########### Do Training #################
def main():
    ray.init()

    #  Hyperparameters of PPO are not well tuned. Most of them refer to https://github.com/xtma/pytorch_car_caring/blob/master/train.py
    trainer = PPOTrainer(env="myenv",
                         config={
                             "use_pytorch": True,
                             "model": {
                                 "custom_model": "mymodel",
                                 "custom_options": {
                                     'encoder_path': args.encoder_path,
                                     'train_encoder': args.train_encoder,
Example #20
        super(DiscreteActionDistribution, self).__init__(inputs, model)
        self._dist = tfd.Categorical(logits=self.inputs,
                                     validate_args=True,
                                     allow_nan_stats=False)

    def sample(self):
        sample = self._dist.sample()
        self._last_sample_logp = self._dist.log_prob(sample)
        return sample

    def logp(self, action):
        action = tf.cast(action, tf.int32)
        return self._dist.log_prob(action)

    def sampled_action_logp(self):
        return self._last_sample_logp

    def entropy(self):
        return self._dist.entropy()

    def kl(self, other):
        """
        Args:
            other: another DiscreteActionDistribution instance
        Returns: KL-Divergence between this distribution and other
        """
        return self._dist.kl_divergence(other._dist)


ModelCatalog.register_custom_action_dist('discrete_action_distribution',
                                         DiscreteActionDistribution)
Example #21
        return [
            d.log_prob(a) for (a, d) in zip(action_parts, self._distributions)
        ]

    def _logp(self, action_parts):
        # print('action_parts:', action_parts)
        # print('self._logp_parts(action_parts):', self._logp_parts(action_parts))
        logp_parts = self._logp_parts(action_parts)
        total_logp = 0
        for term in logp_parts:
            total_logp += term
        return total_logp
        #return tf.reduce_sum(tf.concat(self._logp_parts(action_parts), axis=-1), axis=-1)

    def _extract_action_parts(self, flat_action):
        sample_parts = []
        next_free_idx = 0
        for d in self._distributions:
            start_idx = next_free_idx
            next_free_idx += d.flat_sample_size()
            flat_sample = flat_action[..., start_idx:next_free_idx]
            shaped_sample = d.flat_to_event_shape(flat_sample)
            # print('Extracting action for distribution (dist: {}, start_idx: {}, next_free_idx: {}, flat_sample: {}, shaped_sample: {}' \
            # .format(d, start_idx, next_free_idx, flat_sample, shaped_sample))
            sample_parts.append(shaped_sample)
        return sample_parts


ModelCatalog.register_custom_action_dist(
    'categorical_gaussian_diag_action_dist', CategoricalGaussianDiagActionDist)
Example #22
            'log_std_range': args.log_std_range
        }
        if use_keras_model:
            for key in [
                    'fcnet_hiddens', 'fcnet_activation', 'post_fcnet_hiddens',
                    'post_fcnet_activation', 'no_final_layer',
                    'vf_share_layers', 'free_log_std'
            ]:
                if key in config['model']:
                    config['model']['custom_model_config'][key] = config[
                        'model'][key]

    if args.action_distribution is not None:
        if args.action_distribution == 'truncated_normal':
            from model.custom_action_dist import TruncatedNormal
            ModelCatalog.register_custom_action_dist("truncated_normal",
                                                     TruncatedNormal)
            config['model']['custom_action_dist'] = 'truncated_normal'
        if args.action_distribution == 'truncated_normal_zero_kl':
            from model.custom_action_dist import TruncatedNormalZeroKL
            ModelCatalog.register_custom_action_dist(
                "truncated_normal_zero_kl", TruncatedNormalZeroKL)
            config['model']['custom_action_dist'] = 'truncated_normal_zero_kl'
        if args.action_distribution == 'beta_alpha_beta':
            from model.custom_action_dist import BetaAlphaBeta
            ModelCatalog.register_custom_action_dist("beta_alpha_beta",
                                                     BetaAlphaBeta)
            config['model']['custom_action_dist'] = 'beta_alpha_beta'

    config.update(env=env_name)

    if args.checkpoint is not None:
Example #23
parser.add_argument("--run", type=str, default="PPO")  # try PG, PPO, IMPALA
parser.add_argument("--torch", action="store_true")
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-reward", type=float, default=200)

if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)
    ModelCatalog.register_custom_model(
        "autoregressive_model", TorchAutoregressiveActionModel
        if args.torch else AutoregressiveActionModel)
    ModelCatalog.register_custom_action_dist(
        "binary_autoreg_dist", TorchBinaryAutoregressiveDistribution
        if args.torch else BinaryAutoregressiveDistribution)

    config = {
        "env": CorrelatedActionsEnv,
        "gamma": 0.5,
        "num_gpus": 0,
        "model": {
            "custom_model": "autoregressive_model",
            "custom_action_dist": "binary_autoreg_dist",
        },
        "use_pytorch": args.torch,
    }

    stop = {
        "training_iteration": args.stop_iters,
Example #24

if __name__ == "__main__":
    args = get_cli_args()
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    # main part: register and configure autoregressive action model and dist
    # here, tailored to the CorrelatedActionsEnv such that a2 depends on a1
    ModelCatalog.register_custom_model(
        "autoregressive_model",
        TorchAutoregressiveActionModel
        if args.framework == "torch" else AutoregressiveActionModel,
    )
    ModelCatalog.register_custom_action_dist(
        "binary_autoreg_dist",
        TorchBinaryAutoregressiveDistribution
        if args.framework == "torch" else BinaryAutoregressiveDistribution,
    )

    # standard config
    config = {
        "env": CorrelatedActionsEnv,
        "gamma": 0.5,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": args.framework,
    }
    # use registered model and dist in config
    if not args.no_autoreg:
        config["model"] = {
            "custom_model": "autoregressive_model",
Example #25
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env

from .model import ReallocationModel, Dirichlet
from .env import create_env


register_env("TradingEnv", create_env)
ModelCatalog.register_custom_action_dist("dirichlet", Dirichlet)
ModelCatalog.register_custom_model("reallocate", ReallocationModel)