def test_custom_multi_action_distribution(self):
    class Model:
        pass

    ray.init(
        object_store_memory=1000 * 1024 * 1024,
        ignore_reinit_error=True)  # otherwise fails sometimes locally
    # registration
    ModelCatalog.register_custom_action_dist("test",
                                             CustomMultiActionDistribution)
    s1 = Discrete(5)
    s2 = Box(0, 1, shape=(3,), dtype=np.float32)
    spaces = dict(action_1=s1, action_2=s2)
    action_space = Dict(spaces)
    # test retrieving it
    model_config = MODEL_DEFAULTS.copy()
    model_config["custom_action_dist"] = "test"
    dist_cls, param_shape = ModelCatalog.get_action_dist(
        action_space, model_config)
    self.assertIsInstance(dist_cls, partial)
    self.assertEqual(param_shape, s1.n + 2 * s2.shape[0])
    # test the class works as a distribution
    dist_input = tf1.placeholder(tf.float32, (None, param_shape))
    model = Model()
    model.model_config = model_config
    dist = dist_cls(dist_input, model=model)
    self.assertIsInstance(dist.sample(), dict)
    self.assertIn("action_1", dist.sample())
    self.assertIn("action_2", dist.sample())
    self.assertEqual(dist.sample()["action_1"].dtype, tf.int64)
    self.assertEqual(dist.sample()["action_2"].shape[1:], s2.shape)
    with self.assertRaises(NotImplementedError):
        dist.entropy()
def testCustomActionDistribution(self):
    ray.init()
    # registration
    ModelCatalog.register_custom_action_dist("test", CustomActionDistribution)
    action_space = Box(0, 1, shape=(5, 3), dtype=np.float32)
    # test retrieving it
    model_config = MODEL_DEFAULTS.copy()
    model_config["custom_action_dist"] = "test"
    dist_cls, param_shape = ModelCatalog.get_action_dist(
        action_space, model_config)
    self.assertEqual(str(dist_cls), str(CustomActionDistribution))
    self.assertEqual(param_shape, action_space.shape)
    # test the class works as a distribution
    dist_input = tf.placeholder(tf.float32, (None, ) + param_shape)
    dist = dist_cls(dist_input, model_config=model_config)
    self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
    self.assertIsInstance(dist.sample(), tf.Tensor)
    with self.assertRaises(NotImplementedError):
        dist.entropy()

    # test passing the options to it
    model_config["custom_options"].update({"output_dim": (3, )})
    dist_cls, param_shape = ModelCatalog.get_action_dist(
        action_space, model_config)
    self.assertEqual(param_shape, (3, ))
    dist_input = tf.placeholder(tf.float32, (None, ) + param_shape)
    dist = dist_cls(dist_input, model_config=model_config)
    self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
    self.assertIsInstance(dist.sample(), tf.Tensor)
    with self.assertRaises(NotImplementedError):
        dist.entropy()
def initialize():
    ray.init()
    register_env("coverage", lambda config: CoverageEnv(config))
    register_env("path_planning", lambda config: PathPlanningEnv(config))
    ModelCatalog.register_custom_model("adversarial", AdversarialModel)
    ModelCatalog.register_custom_action_dist(
        "hom_multi_action", TorchHomogeneousMultiActionDistribution)
def register_mixture_action_distribution():
    ModelCatalog.register_custom_action_dist(GaussianMixture.name,
                                             GaussianMixture)
    ModelCatalog.register_custom_action_dist(DeterministicMixture.name,
                                             DeterministicMixture)
    print("Successfully registered GaussianMixture and DeterministicMixture "
          "action distributions.")
def __init__(self, load_path, algorithm, policy_name, observation_space,
             action_space):
    self._checkpoint_path = load_path
    self._policy_name = policy_name
    self._observation_space = observation_space
    self._action_space = action_space
    self._prep = ModelCatalog.get_preprocessor_for_space(
        self._observation_space)
    flat_obs_space = self._prep.observation_space

    ray.init(ignore_reinit_error=True, local_mode=True)

    from utils.ppo_policy import PPOTorchPolicy as LoadPolicy

    ModelCatalog.register_custom_action_dist("my_dist",
                                             TorchRepeatDistribution)
    config = ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG.copy()
    config['num_workers'] = 0
    config["model"]["use_lstm"] = True
    config['model']['free_log_std'] = False
    config["model"]["custom_action_dist"] = "my_dist"
    self.policy = LoadPolicy(flat_obs_space, self._action_space, config)

    objs = pickle.load(open(self._checkpoint_path, "rb"))
    objs = pickle.loads(objs["worker"])
    state = objs["state"]
    filters = objs["filters"]
    self.filters = filters[self._policy_name]
    weights = state[self._policy_name]
    weights.pop("_optimizer_variables")
    self.policy.set_weights(weights)

    self.model = self.policy.model
    self.rnn_state = self.model.get_initial_state()
    self.rnn_state = [
        self.rnn_state[0].unsqueeze(0),
        self.rnn_state[1].unsqueeze(0),
    ]
def setup_ma_config(config, create_env):
    env = create_env(config['env_config'])
    policies_to_train = ['agent']

    num_adversaries = config['env_config']['num_adv_strengths'] * \
        config['env_config']['advs_per_strength']
    if num_adversaries == 0:
        return
    adv_policies = ['adversary' + str(i) for i in range(num_adversaries)]
    adversary_config = {
        "model": {'fcnet_hiddens': [64, 64], 'use_lstm': False},
        "entropy_coeff": config['env_config']['entropy_coeff']
    }
    if config['env_config']['run'] == 'PPO':
        if config['env_config']['kl_reward']:
            ModelCatalog.register_custom_action_dist("logits_dist", LogitsDist)
            adversary_config['model']['custom_action_dist'] = "logits_dist"
        # for both of these we need a graph that zeros out agents that weren't active
        if config['env_config']['kl_reward'] or (
                config['env_config']['l2_reward']
                and not config['env_config']['l2_memory']):
            policy_graphs = {
                'agent': (PPOTFPolicy, env.observation_space,
                          env.action_space, {})
            }
            policy_graphs.update({
                adv_policies[i]: (CustomPPOPolicy, env.adv_observation_space,
                                  env.adv_action_space, adversary_config)
                for i in range(num_adversaries)
            })
        else:
            policy_graphs = {
                'agent': (PPOTFPolicy, env.observation_space,
                          env.action_space, {})
            }
            policy_graphs.update({
                adv_policies[i]: (PPOTFPolicy, env.adv_observation_space,
                                  env.adv_action_space, adversary_config)
                for i in range(num_adversaries)
            })
    elif config['env_config']['run'] == 'TD3':
        policy_graphs = {
            'agent': (DDPGTFPolicy, env.observation_space, env.action_space, {})
        }
        policy_graphs.update({
            adv_policies[i]: (DDPGTFPolicy, env.adv_observation_space,
                              env.adv_action_space, adversary_config)
            for i in range(num_adversaries)
        })
        # policy_graphs.update({adv_policies[i]: (CustomPPOPolicy, env.adv_observation_space,
        #                                         env.adv_action_space, adversary_config)
        #                       for i in range(num_adversaries)})

    print("========= Policy Graphs ==========")
    print(policy_graphs)

    policies_to_train += adv_policies

    def policy_mapping_fn(agent_id):
        return agent_id

    config.update({
        'multiagent': {
            'policies': policy_graphs,
            'policy_mapping_fn': policy_mapping_fn,
            'policies_to_train': policies_to_train
        }
    })
    print({
        'multiagent': {
            'policies': policy_graphs,
            'policy_mapping_fn': policy_mapping_fn,
            'policies_to_train': policies_to_train
        }
    })
def test_custom_action_distribution(self):
    class Model:
        pass

    ray.init(
        object_store_memory=1000 * 1024 * 1024,
        ignore_reinit_error=True)  # otherwise fails sometimes locally
    # registration
    ModelCatalog.register_custom_action_dist("test", CustomActionDistribution)
    action_space = Box(0, 1, shape=(5, 3), dtype=np.float32)
    # test retrieving it
    model_config = MODEL_DEFAULTS.copy()
    model_config["custom_action_dist"] = "test"
    dist_cls, param_shape = ModelCatalog.get_action_dist(
        action_space, model_config)
    self.assertEqual(str(dist_cls), str(CustomActionDistribution))
    self.assertEqual(param_shape, action_space.shape)
    # test the class works as a distribution
    dist_input = tf1.placeholder(tf.float32, (None, ) + param_shape)
    model = Model()
    model.model_config = model_config
    dist = dist_cls(dist_input, model=model)
    self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
    self.assertIsInstance(dist.sample(), tf.Tensor)
    with self.assertRaises(NotImplementedError):
        dist.entropy()

    # test passing the options to it
    model_config["custom_model_config"].update({"output_dim": (3, )})
    dist_cls, param_shape = ModelCatalog.get_action_dist(
        action_space, model_config)
    self.assertEqual(param_shape, (3, ))
    dist_input = tf1.placeholder(tf.float32, (None, ) + param_shape)
    model.model_config = model_config
    dist = dist_cls(dist_input, model=model)
    self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
    self.assertIsInstance(dist.sample(), tf.Tensor)
    with self.assertRaises(NotImplementedError):
        dist.entropy()
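# The two tests above assume a CustomActionDistribution defined elsewhere in
# the test module. The following is only a minimal, hypothetical sketch of
# what such a class might look like (not the actual test fixture), so the
# assertions on param_shape, sample(), and entropy() above make sense.
from ray.rllib.models.action_dist import ActionDistribution
from ray.rllib.utils.framework import try_import_tf

tf1, tf, tfv = try_import_tf()


class CustomActionDistribution(ActionDistribution):
    @staticmethod
    def required_model_output_shape(action_space, model_config):
        # Honor an "output_dim" override from custom_model_config; otherwise
        # fall back to the action space shape (which is why param_shape
        # becomes (3,) once the option is set in the test).
        custom_config = model_config.get("custom_model_config") or {}
        if "output_dim" in custom_config:
            return custom_config["output_dim"]
        return action_space.shape

    def sample(self):
        # "Sampling" simply forwards the model output, so the sample shape
        # matches the distribution input shape checked by the test.
        return self.inputs

    def logp(self, x):
        # Placeholder log-prob of shape (batch,).
        return tf.zeros([tf.shape(self.inputs)[0]])

    # entropy() is deliberately left unimplemented; the ActionDistribution
    # base class raises NotImplementedError, which the test expects.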
def persuasive_a3c_conf(rollout_size=10, agents=100,
                        debug_folder=None, eval_folder=None,
                        alpha=0.0001, gamma=0.99):
    """
    https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/trainer.py#L44
    https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/a3c/a3c.py#L14
    https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/models/catalog.py#L37
    """
    ModelCatalog.register_custom_model('custom_rrn', RNNModel)
    ModelCatalog.register_custom_action_dist(
        "custom_action_distribution", PersuasiveActionDistribution)

    custom_configuration = DEFAULT_CONFIG
    custom_configuration['collect_metrics_timeout'] = 86400  # a day
    custom_configuration['framework'] = 'tf'
    custom_configuration['ignore_worker_failures'] = True
    custom_configuration['log_level'] = 'WARN'
    custom_configuration['monitor'] = True
    custom_configuration['num_cpus_for_driver'] = 1
    custom_configuration['num_cpus_per_worker'] = 1
    custom_configuration['num_envs_per_worker'] = 1
    custom_configuration['num_gpus_per_worker'] = 1
    custom_configuration['num_gpus'] = 1
    custom_configuration['num_workers'] = 1
    custom_configuration['output'] = debug_folder
    custom_configuration['remote_env_batch_wait_ms'] = 1000
    custom_configuration['remote_worker_envs'] = False
    custom_configuration['seed'] = 42
    custom_configuration['timesteps_per_iteration'] = 1

    # === Environment Settings ===
    custom_configuration['batch_mode'] = 'complete_episodes'
    custom_configuration['gamma'] = gamma
    custom_configuration['lr'] = alpha
    custom_configuration['no_done_at_end'] = False
    # Divide episodes into fragments of this many steps each during rollouts.
    # Sample batches of this size are collected from rollout workers and
    # combined into a larger batch of `train_batch_size` for learning.
    # For example, given rollout_fragment_length=100 and train_batch_size=1000:
    #   1. RLlib collects 10 fragments of 100 steps each from rollout workers.
    #   2. These fragments are concatenated and we perform an epoch of SGD.
    # When using multiple envs per worker, the fragment size is multiplied by
    # `num_envs_per_worker`. This is since we are collecting steps from
    # multiple envs in parallel. For example, if num_envs_per_worker=5, then
    # rollout workers will return experiences in chunks of 5*100 = 500 steps.
    # The dataflow here can vary per algorithm. For example, PPO further
    # divides the train batch into minibatches for multi-epoch SGD.
    custom_configuration['rollout_fragment_length'] = rollout_size
    # Training batch size, if applicable. Should be >= rollout_fragment_length.
    # Sample batches will be concatenated together to a batch of this size,
    # which is then passed to SGD.
    custom_configuration['train_batch_size'] = rollout_size * agents

    # === Exploration Settings ===
    # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/stochastic_sampling.py
    # custom_configuration['exploration_config']['type'] = 'StochasticSampling'
    # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/epsilon_greedy.py
    custom_configuration['exploration_config']['type'] = 'EpsilonGreedy'
    custom_configuration['exploration_config']['initial_epsilon'] = 1.0
    custom_configuration['exploration_config']['final_epsilon'] = 0.0001

    # ==================== MODEL - DEFAULT ====================
    # custom_configuration['model']['fcnet_hiddens'] = [64, 64]
    # === Built-in options ===
    # Filter config. List of [out_channels, kernel, stride] for each filter.
    # custom_configuration['model']['conv_filters'] = None
    # Nonlinearity for built-in convnet.
    # custom_configuration['model']['conv_activation'] = "relu"
    # Nonlinearity for fully connected net (tanh, relu).
    # custom_configuration['model']['fcnet_activation'] = "tanh"
    # Number of hidden layers for fully connected net.
    # custom_configuration['model']['fcnet_hiddens'] = [64, 64]
    # For DiagGaussian action distributions, make the second half of the model
    # outputs floating bias variables instead of state-dependent. This only
    # has an effect if using the default fully connected net.
    # custom_configuration['model']['free_log_std'] = False
    # Whether to skip the final linear layer used to resize the hidden layer
    # outputs to size `num_outputs`. If True, then the last hidden layer
    # should already match num_outputs.
    # custom_configuration['model']['no_final_linear'] = False
    # Whether layers should be shared for the value function.
    # custom_configuration['model']['vf_share_layers'] = True
    # == LSTM ==
    # Whether to wrap the model with an LSTM.
    # custom_configuration['model']['use_lstm'] = False
    # Max seq len for training the LSTM, defaults to 20.
    # custom_configuration['model']['max_seq_len'] = 20
    # Size of the LSTM cell.
    # custom_configuration['model']['lstm_cell_size'] = 64
    # Whether to feed a_{t-1}, r_{t-1} to LSTM.
    # custom_configuration['model']['lstm_use_prev_action_reward'] = False
    # When using modelv1 models with a modelv2 algorithm, you may have to
    # define the state shape here (e.g., [256, 256]).
    # custom_configuration['model']['state_shape'] = None  # [64, 64]
    # == Atari ==
    # Whether to enable framestack for Atari envs.
    # custom_configuration['model']['framestack'] = True
    # Final resized frame dimension.
    # custom_configuration['model']['dim'] = 84
    # (deprecated) Converts ATARI frame to 1 Channel Grayscale image.
    # custom_configuration['model']['grayscale'] = False
    # (deprecated) Changes frame to range from [-1, 1] if true.
    # custom_configuration['model']['zero_mean'] = True

    # === Options for custom models ===
    # Name of a custom model to use.
    custom_configuration['model']['custom_model'] = 'custom_rrn'
    # Extra options to pass to the custom classes.
    # These will be available in the Model's constructor.
    custom_configuration['model']['custom_model_config'] = {}
    # Name of a custom action distribution to use.
    # See: https://docs.ray.io/en/releases-1.0.0/rllib-models.html#custom-action-distributions
    custom_configuration['model']['custom_action_dist'] = 'custom_action_distribution'

    # == OPTIMIZER ==
    # Arguments to pass to the policy optimizer. These vary by optimizer.
    # custom_configuration['optimizer'] = {}

    # == Persuasive A3C ==
    custom_configuration['callbacks'] = PersuasiveCallbacks
    custom_configuration['min_iter_time_s'] = 5
    custom_configuration['use_gae'] = True

    # === Evaluation Settings ===
    # Evaluate with every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    custom_configuration['evaluation_interval'] = 5
    # Number of episodes to run per evaluation period. If using multiple
    # evaluation workers, we will run at least this many episodes total.
    custom_configuration['evaluation_num_episodes'] = 5
    # Internal flag that is set to True for evaluation workers.
    # DEFAULT: 'in_evaluation': False
    # Typical usage is to pass extra args to evaluation env creator
    # and to disable exploration by computing deterministic actions.
    # IMPORTANT NOTE: Policy gradient algorithms are able to find the optimal
    # policy, even if this is a stochastic one. Setting 'explore=False' here
    # will result in the evaluation workers not using this optimal policy!
    custom_configuration['evaluation_config']['explore'] = False
    custom_configuration['evaluation_config']['lr'] = 0
    custom_configuration['evaluation_config']['num_gpus_per_worker'] = 0
    custom_configuration['evaluation_config']['num_gpus'] = 0
    custom_configuration['evaluation_config']['output'] = eval_folder
    # custom_configuration['evaluation_config']['env_config'] = {...}
    # Number of parallel workers to use for evaluation. Note that this is set
    # to zero by default, which means evaluation will be run in the trainer
    # process. If you increase this, it will increase the Ray resource usage
    # of the trainer since evaluation workers are created separately from
    # rollout workers.
    custom_configuration['evaluation_num_workers'] = 1
    # Customize the evaluation method. This must be a function of signature
    # (trainer: Trainer, eval_workers: WorkerSet) -> metrics: dict. See the
    # Trainer._evaluate() method to see the default implementation. The
    # trainer guarantees all eval workers have the latest policy state before
    # this function is called.
    custom_configuration['custom_eval_function'] = None  # custom_eval_function

    return custom_configuration
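# Illustrative only: the fragment/train-batch arithmetic described in the
# comments above, worked out for this function's default arguments
# (rollout_size=10, agents=100). Values are assumptions for demonstration.
rollout_size, agents = 10, 100
rollout_fragment_length = rollout_size        # 10 steps per fragment
train_batch_size = rollout_size * agents      # 1000 steps per SGD epoch
fragments_per_train_batch = train_batch_size // rollout_fragment_length
assert fragments_per_train_batch == agents == 100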
# "shuffle_sequences": True, # "num_sgd_iter": 15, # "lr": 5e-5, # "vf_share_layers": True, # "vf_loss_coeff": 0.5, # "entropy_coeff": 0.001, # "entropy_coeff_schedule": None, # "clip_param": 0.2, # "kl_target": 0.01, # "grad_clip": 5.0, # "gamma": 0.999, # "sample_batch_size": 128, # "train_batch_size": 1024 # } # config.update(ppo_conf) ModelCatalog.register_custom_action_dist("my_dist", MultiOrdinal) config["num_gpus"] = args.ngpu # used for trainer process config["num_workers"] = args.ncpu config['num_envs_per_worker'] = 1 config['env_config'] = envconf config['horizon'] = envconf['max_steps'] config['model'] = { "custom_model": 'deepdrug3d', 'custom_action_dist': 'my_dist' } # trainer = ppo.PPOTrainer(config=config, env='lactamase_docking') # # trainer.restore('/homes/aclyde11/ray_results/PPO_lactamase_docking_2019-11-22_16-34-28igjfjjyh/checkpoint_1052/checkpoint-1052') # policy = trainer.get_policy() # print(policy.model.base_model.summary()) #
def persuasive_ppo_conf(rollout_size=10, agents=100,
                        true_obs_shape=None, action_embed_size=None,
                        debug_folder=None, eval_folder=None,
                        alpha=5e-5, gamma=0.99):
    """
    https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/trainer.py#L44
    https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/ppo/ppo.py#L15
    https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/models/catalog.py#L37
    """
    ModelCatalog.register_custom_action_dist('custom_action_distribution',
                                             PersuasiveActionDistribution)
    ModelCatalog.register_custom_model('custom_model',
                                       OwnershipActionMaskingModel)

    custom_configuration = ppo.DEFAULT_CONFIG.copy()
    # custom_configuration['collect_metrics_timeout'] = 86400  # a day
    custom_configuration['framework'] = 'tf'
    custom_configuration['ignore_worker_failures'] = True
    custom_configuration['log_level'] = 'WARN'
    custom_configuration['monitor'] = True
    custom_configuration['num_gpus'] = 0
    custom_configuration['num_cpus_for_driver'] = 1
    custom_configuration['num_cpus_per_worker'] = 1
    custom_configuration['num_envs_per_worker'] = 1
    # custom_configuration['output'] = debug_folder
    # custom_configuration['remote_env_batch_wait_ms'] = 1000
    # custom_configuration['remote_worker_envs'] = False
    custom_configuration['seed'] = 42

    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    custom_configuration['num_workers'] = 1
    custom_configuration['num_gpus_per_worker'] = 0
    # Prevent iterations from going lower than this time span.
    # custom_configuration['min_iter_time_s'] = 1

    # === Environment Settings ===
    custom_configuration['batch_mode'] = 'complete_episodes'
    custom_configuration['callbacks'] = PersuasiveCallbacks
    custom_configuration['gamma'] = gamma
    custom_configuration['lr'] = alpha
    custom_configuration['lr_schedule'] = None
    custom_configuration['no_done_at_end'] = False

    # === Exploration Settings ===
    # custom_configuration['exploration_config'] = {}
    # # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/epsilon_greedy.py
    # custom_configuration['exploration_config']['type'] = 'EpsilonGreedy'
    # custom_configuration['exploration_config']['initial_epsilon'] = 1.0
    # custom_configuration['exploration_config']['final_epsilon'] = 0.02
    # custom_configuration['exploration_config']['epsilon_timesteps'] = 10000
    # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/soft_q.py
    # custom_configuration['exploration_config']['type'] = 'SoftQ'
    # custom_configuration['exploration_config']['temperature'] = 1.0  # Default

    # Name of a custom action distribution to use.
    # See: https://docs.ray.io/en/releases-1.0.0/rllib-models.html#custom-action-distributions
    # custom_configuration['model']['custom_action_dist'] = 'custom_action_distribution'

    # === PPO Model Settings ===
    # Should use a critic as a baseline (otherwise don't use value baseline;
    # required for using GAE).
    custom_configuration['use_critic'] = True
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    custom_configuration['use_gae'] = True
    # The GAE(lambda) parameter.
    custom_configuration['lambda'] = 1.0
    # Initial coefficient for KL divergence.
    custom_configuration['kl_coeff'] = 0.2
    # Size of batches collected from each worker.
    custom_configuration['rollout_fragment_length'] = 10  # 100
    # Number of timesteps collected for each SGD round. This defines the size
    # of each SGD epoch.
    custom_configuration['train_batch_size'] = 100  # 1000
    # Total SGD batch size across all devices for SGD. This defines the
    # minibatch size within each epoch.
    custom_configuration['sgd_minibatch_size'] = 64  # 128 # 64
    # Whether to shuffle sequences in the batch when training (recommended).
    custom_configuration['shuffle_sequences'] = True
    # Number of SGD iterations in each outer loop (i.e., number of epochs to
    # execute per train batch).
    custom_configuration['num_sgd_iter'] = 30
    # Share layers for value function. If you set this to True, it's important
    # to tune vf_loss_coeff.
    custom_configuration['vf_share_layers'] = False
    # Coefficient of the value function loss. IMPORTANT: you must tune this if
    # you set vf_share_layers: True.
    custom_configuration['vf_loss_coeff'] = 1.0
    # Coefficient of the entropy regularizer.
    custom_configuration['entropy_coeff'] = 0.0
    # Decay schedule for the entropy regularizer.
    custom_configuration['entropy_coeff_schedule'] = None
    # PPO clip parameter.
    custom_configuration['clip_param'] = 0.3
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    custom_configuration['vf_clip_param'] = 10.0
    # If specified, clip the global norm of gradients by this amount.
    custom_configuration['grad_clip'] = None
    # Target value for KL divergence.
    custom_configuration['kl_target'] = 0.01
    # Which observation filter to apply to the observation.
    custom_configuration['observation_filter'] = "NoFilter"
    # Uses the sync samples optimizer instead of the multi-gpu one. This is
    # usually slower, but you might want to try it if you run into issues with
    # the default optimizer.
    custom_configuration['simple_optimizer'] = False
    # Whether to fake GPUs (using CPUs).
    # Set this to True for debugging on non-GPU machines (set `num_gpus` > 0).
    custom_configuration['_fake_gpus'] = False

    # === MODEL ===
    custom_configuration['model']['use_lstm'] = False
    custom_configuration['model']['custom_model'] = 'custom_model'
    custom_configuration['model']['custom_model_config'][
        'true_obs_shape'] = true_obs_shape
    custom_configuration['model']['custom_model_config'][
        'action_embed_size'] = action_embed_size

    # === Evaluation Settings ===
    # Evaluate with every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    custom_configuration['evaluation_interval'] = 1
    # Number of episodes to run per evaluation period. If using multiple
    # evaluation workers, we will run at least this many episodes total.
    custom_configuration['evaluation_num_episodes'] = 5
    # Internal flag that is set to True for evaluation workers.
    # DEFAULT: 'in_evaluation': False
    # Typical usage is to pass extra args to evaluation env creator
    # and to disable exploration by computing deterministic actions.
    # IMPORTANT NOTE: Policy gradient algorithms are able to find the optimal
    # policy, even if this is a stochastic one. Setting 'explore=False' here
    # will result in the evaluation workers not using this optimal policy!
    custom_configuration['evaluation_config']['explore'] = True
    custom_configuration['evaluation_config']['lr'] = 0
    custom_configuration['evaluation_config']['num_gpus_per_worker'] = 0
    custom_configuration['evaluation_config']['num_gpus'] = 0
    # custom_configuration['evaluation_config']['output'] = eval_folder
    # custom_configuration['evaluation_config']['env_config'] = {...}
    # Number of parallel workers to use for evaluation. Note that this is set
    # to zero by default, which means evaluation will be run in the trainer
    # process. If you increase this, it will increase the Ray resource usage
    # of the trainer since evaluation workers are created separately from
    # rollout workers.
    custom_configuration['evaluation_num_workers'] = 1
    # Customize the evaluation method. This must be a function of signature
    # (trainer: Trainer, eval_workers: WorkerSet) -> metrics: dict. See the
    # Trainer._evaluate() method to see the default implementation. The
    # trainer guarantees all eval workers have the latest policy state before
    # this function is called.
    custom_configuration['custom_eval_function'] = None  # custom_eval_function

    return custom_configuration
"receiver") class DeterministicSenderDist(DeterministicMessageActionDistribution): def __init__(self, inputs, model): super().__init__(inputs, model, sample_env.action_space["sender"], "sender") class DeterministicReceiverDist(DeterministicMessageActionDistribution): def __init__(self, inputs, model): super().__init__(inputs, model, sample_env.action_space["receiver"], "receiver") ModelCatalog.register_custom_action_dist("sender_dist", SenderDist) ModelCatalog.register_custom_action_dist("receiver_dist", ReceiverDist) ModelCatalog.register_custom_action_dist("deterministic_sender_dist", DeterministicSenderDist) ModelCatalog.register_custom_action_dist("deterministic_receiver_dist", DeterministicReceiverDist) configs = [] default_config = { # "run": "IMPALA", "run": ImpalaCPCSaTrainer, "tot_steps": 1e9, "max_workers":
)

parser = argparse.ArgumentParser()
parser.add_argument('--run', type=str, default='PG')  # A3C
parser.add_argument('--stop', type=int, default=180)

if __name__ == '__main__':
    args = parser.parse_args()
    ray.init()
    register_env('cart_poles_env',
                 lambda env_config: CartPolesEnv(env_config))
    register_env('cart_poles_stacked_env',
                 lambda env_config: CartPolesStackedEnv(env_config))
    ModelCatalog.register_custom_model('cart_poles_model', CartPolesModel)
    ModelCatalog.register_custom_model('cart_poles_stacked_model',
                                       CartPolesStackedModel)
    ModelCatalog.register_custom_action_dist('cart_poles_action_dist',
                                             CartPolesActionDist)
    tune.run(
        args.run,
        stop={'episode_reward_mean': args.stop},
        config={
            'env': 'cart_poles_stacked_env',
            # 'gamma': 0.99,
            'num_workers': 3,
            'model': {
                'custom_model': 'cart_poles_stacked_model',
                'custom_action_dist': 'cart_poles_action_dist',
            },
        })
def main(args):
    # ====================================
    # init env config
    # ====================================
    if args.no_debug:
        ray.init(webui_host="127.0.0.1")
    else:
        ray.init(local_mode=True, webui_host="127.0.0.1")
    # use ray cluster for training
    # ray.init(
    #     address="auto" if args.address is None else args.address,
    #     redis_password="******",
    # )
    #
    # print(
    #     "--------------- Ray startup ------------\n{}".format(
    #         ray.state.cluster_resources()
    #     )
    # )

    agent_specs = {"AGENT-007": agent_spec}

    env_config = {
        "seed": 42,
        "scenarios": [scenario_paths],
        "headless": args.headless,
        "agent_specs": agent_specs,
    }

    # ====================================
    # init tune config
    # ====================================
    class MultiEnv(RLlibHiWayEnv):
        def __init__(self, env_config):
            env_config["scenarios"] = [
                scenario_paths[(env_config.worker_index - 1) %
                               len(scenario_paths)]
            ]
            super(MultiEnv, self).__init__(config=env_config)

        def step(self, agent_actions):
            for agent_id in agent_actions:
                repeat = int(agent_actions[agent_id][-1])
            agent_actions = {
                agent_id: actions[:-1]
                for agent_id, actions in agent_actions.items()
            }
            if self.pre_act is not None and repeat > 0:
                obs, r, done, info = super().step(self.pre_act)
            else:
                obs, r, done, info = super().step(agent_actions)
            self.pre_act = agent_actions
            return obs, r, done, info

        def reset(self):
            self.pre_act = None
            return super().reset()

    ModelCatalog.register_custom_model("my_fc", FullyConnectedNetwork)
    ModelCatalog.register_custom_action_dist("my_dist", TorchDyDistribution)

    tune_config = {
        "env": MultiEnv,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "default_policy": (
                    None,
                    OBSERVATION_SPACE,
                    ACTION_SPACE,
                    {},
                )
            },
            "policy_mapping_fn": lambda agent_id: "default_policy",
        },
        "model": {
            "custom_model": "my_fc",
            "custom_action_dist": "my_dist",
        },
        "framework": "torch",
        "callbacks": {
            "on_episode_start": on_episode_start,
            "on_episode_step": on_episode_step,
            "on_episode_end": on_episode_end,
        },
        "lr": 1e-4,
        "log_level": "WARN",
        "num_workers": args.num_workers,
        "horizon": args.horizon,
        "train_batch_size": 5120 * 3,
        # "observation_filter": "MeanStdFilter",
        # "batch_mode": "complete_episodes",
        # "grad_clip": 0.5,
        # "model": {
        #     "use_lstm": True,
        # },
    }

    tune_config.update({
        "lambda": 0.95,
        "clip_param": 0.2,
        "num_sgd_iter": 10,
        "sgd_minibatch_size": 512,
        "gamma": 0.995,
        "seed_global": tune.grid_search([10, 20, 30, 40])
    })

    # ====================================
    # init log and checkpoint dir_info
    # ====================================
    experiment_name = EXPERIMENT_NAME.format(
        scenario=args.exper,
        algorithm="PPO",
        n_agent=1,
    )

    log_dir = Path(args.log_dir).expanduser().absolute() / RUN_NAME
    log_dir.mkdir(parents=True, exist_ok=True)
    print(f"Checkpointing at {log_dir}")

    if args.restore:
        restore_path = Path(args.restore).expanduser()
        print(f"Loading model from {restore_path}")
    else:
        restore_path = None

    # run experiments
    analysis = tune.run(
        PPOTrainer,  # "PPO",
        name=experiment_name,
        stop={"timesteps_total": 10000000},
        checkpoint_freq=20,
        checkpoint_at_end=True,
        local_dir=str(log_dir),
        resume=args.resume,
        restore=restore_path,
        max_failures=1000,
        export_formats=["model", "checkpoint"],
        config=tune_config,
    )

    print(analysis.dataframe().head())
def setup(self, cfg: DictConfig):
    """
    This method initializes and registers all necessary maze components with RLlib

    :param cfg: Full Hydra run job config
    """
    # Generate a random state used for sampling random seeds for the envs and agents
    self.maze_seeding = MazeSeeding(cfg.seeding.env_base_seed,
                                    cfg.seeding.agent_base_seed,
                                    cfg.seeding.cudnn_determinism_flag)

    self._cfg = cfg

    # Initialize env factory (with rllib monkey patches)
    self.env_factory = build_maze_rllib_env_factory(cfg)

    # Register maze env factory with rllib
    tune.register_env("maze_env", lambda x: self.env_factory())

    # Register maze model and distribution mapper if a maze model should be used
    # Check whether we are using the rllib default model composer or a maze model
    using_rllib_model_composer = '_target_' not in cfg.model.keys()
    if not using_rllib_model_composer:
        # Get model class
        model_cls = Factory(MazeRLlibBaseModel).type_from_name(
            cfg.algorithm.model_cls)

        # Register maze model
        ModelCatalog.register_custom_model("maze_model", model_cls)

        if 'policy' in cfg.model and "networks" in cfg.model.policy:
            assert len(cfg.model.policy.networks
                       ) == 1, 'Hierarchical envs are not yet supported'

        # register maze action distribution
        ModelCatalog.register_custom_action_dist(
            'maze_dist', MazeRLlibActionDistribution)

        model_config = {
            "custom_action_dist": 'maze_dist',
            "custom_model": "maze_model",
            "vf_share_layers": False,
            "custom_model_config": {
                "maze_model_composer_config": cfg.model,
                'spaces_config_dump_file': self.spaces_config_dump_file,
                'state_dict_dump_file': self.state_dict_dump_file
            }
        }
    else:
        # If specified use the default rllib model builder
        model_config = OmegaConf.to_container(cfg.model, resolve=True)

    # Build rllib config
    maze_rllib_config = {
        "env": "maze_env",
        # Store env config for possible later use
        "env_config": {
            'env': cfg.env,
            'wrappers': cfg.wrappers
        },
        "model": model_config,
        'callbacks': MazeRLlibLoggingCallbacks,
        "framework": "torch"
    }

    # Load the algorithm config and update the custom parameters
    rllib_config: Dict = OmegaConf.to_container(cfg.algorithm.config,
                                                resolve=True)
    assert 'model' not in rllib_config, \
        'The config should be removed from the default yaml files since it ' \
        'will be dynamically written'
    assert self.num_workers == rllib_config['num_workers']
    rllib_config.update(maze_rllib_config)
    if rllib_config['seed'] is None:
        rllib_config['seed'] = self.maze_seeding.generate_env_instance_seed()

    # Initialize ray with the passed ray_config parameters
    ray_config: Dict = OmegaConf.to_container(self.ray_config, resolve=True)

    # Load tune parameters
    tune_config = OmegaConf.to_container(self.tune_params, resolve=True)
    tune_config['callbacks'] = [MazeRLlibSaveModelCallback()]

    # Start tune experiment
    assert 'config' not in tune_config, \
        'The config should be removed from the default yaml files since it ' \
        'will be dynamically written'

    self.ray_config = ray_config
    self.rllib_config = rllib_config
    self.tune_config = tune_config
def persuasive_dqn_conf(rollout_size=10, agents=100,
                        debug_folder=None, eval_folder=None,
                        alpha=5e-4, gamma=0.99):
    """
    https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/trainer.py#L44
    https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/agents/dqn/dqn.py#L21
    https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/models/catalog.py#L37
    """
    # ModelCatalog.register_custom_model('custom_rrn', RNNModel)
    ModelCatalog.register_custom_action_dist('custom_action_distribution',
                                             PersuasiveActionDistribution)

    custom_configuration = dqn.DEFAULT_CONFIG.copy()
    custom_configuration['collect_metrics_timeout'] = 86400  # a day
    custom_configuration['framework'] = 'tf'
    custom_configuration['ignore_worker_failures'] = True
    custom_configuration['log_level'] = 'WARN'
    custom_configuration['monitor'] = True
    custom_configuration['num_cpus_for_driver'] = 1
    custom_configuration['num_cpus_per_worker'] = 1
    custom_configuration['num_envs_per_worker'] = 1
    custom_configuration['output'] = debug_folder
    custom_configuration['remote_env_batch_wait_ms'] = 1000
    custom_configuration['remote_worker_envs'] = False
    custom_configuration['seed'] = 42

    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    custom_configuration['num_workers'] = 0
    custom_configuration['num_gpus_per_worker'] = 1
    # Whether to compute priorities on workers.
    custom_configuration['worker_side_prioritization'] = False
    # Prevent iterations from going lower than this time span.
    custom_configuration['min_iter_time_s'] = 1

    # === Environment Settings ===
    custom_configuration['batch_mode'] = 'complete_episodes'
    custom_configuration['callbacks'] = PersuasiveCallbacks
    custom_configuration['gamma'] = gamma
    custom_configuration['lr'] = alpha
    custom_configuration['no_done_at_end'] = False
    # Update the replay buffer with this many samples at once. Note that
    # this setting applies per-worker if num_workers > 1.
    custom_configuration['rollout_fragment_length'] = rollout_size
    # Size of a batch sampled from replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    custom_configuration['train_batch_size'] = rollout_size * agents
    # If positive, input batches will be shuffled via a sliding window buffer
    # of this number of batches. Use this if the input data is not in random
    # enough order. Input is delayed until the shuffle buffer is filled.
    custom_configuration['shuffle_buffer_size'] = rollout_size * agents
    # Minimum env steps to optimize for per train call. This value does
    # not affect learning, only the length of train iterations.
    custom_configuration['timesteps_per_iteration'] = agents
    # How many steps of the model to sample before learning starts.
    custom_configuration['learning_starts'] = rollout_size * agents

    # === Exploration Settings ===
    custom_configuration['exploration_config'] = {}
    # # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/epsilon_greedy.py
    # custom_configuration['exploration_config']['type'] = 'EpsilonGreedy'
    # custom_configuration['exploration_config']['initial_epsilon'] = 1.0
    # custom_configuration['exploration_config']['final_epsilon'] = 0.02
    # custom_configuration['exploration_config']['epsilon_timesteps'] = 10000
    # https://github.com/ray-project/ray/blob/releases/1.0.0/rllib/utils/exploration/soft_q.py
    custom_configuration['exploration_config']['type'] = 'SoftQ'
    custom_configuration['exploration_config']['temperature'] = 1.0  # Default

    # Name of a custom action distribution to use.
    # See: https://docs.ray.io/en/releases-1.0.0/rllib-models.html#custom-action-distributions
    # custom_configuration['model']['custom_action_dist'] = 'custom_action_distribution'

    # === DQN Model Settings ===
    # Update the target network every `target_network_update_freq` steps.
    # custom_configuration['target_network_update_freq'] = rollout_size
    # every agent should have done at least 1 action
    custom_configuration['target_network_update_freq'] = agents
    custom_configuration['n_step'] = 10
    # Number of atoms for representing the distribution of return. When
    # this is greater than 1, distributional Q-learning is used.
    # The discrete supports are bounded by v_min and v_max.
    custom_configuration['num_atoms'] = 1
    custom_configuration['v_min'] = -10.0
    custom_configuration['v_max'] = 10.0
    # Whether to use a noisy network.
    custom_configuration['noisy'] = False
    # Control the initial value of noisy nets.
    custom_configuration['sigma0'] = 0.5
    # Whether to use dueling DQN.
    custom_configuration['dueling'] = False  # True
    # Dense-layer setup for each the advantage branch and the value branch
    # in a dueling architecture.
    custom_configuration['hiddens'] = [256]
    # Whether to use double DQN.
    custom_configuration['double_q'] = False  # True

    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    custom_configuration['buffer_size'] = 10000  # 50000
    # If True, a prioritized replay buffer will be used.
    custom_configuration['prioritized_replay'] = False
    # Alpha parameter for prioritized replay buffer.
    custom_configuration['prioritized_replay_alpha'] = 0.6
    # Beta parameter for sampling from prioritized replay buffer.
    custom_configuration['prioritized_replay_beta'] = 0.4
    # Final value of beta (by default, we use constant beta=0.4).
    custom_configuration['final_prioritized_replay_beta'] = 0.4
    # Time steps over which the beta parameter is annealed.
    custom_configuration['prioritized_replay_beta_annealing_timesteps'] = 20000
    # Epsilon to add to the TD errors when updating priorities.
    custom_configuration['prioritized_replay_eps'] = 1e-6
    # Whether to LZ4 compress observations.
    custom_configuration['compress_observations'] = False
    # Callback to run before learning on a multi-agent batch of experiences.
    custom_configuration['before_learn_on_batch'] = None
    # If set, this will fix the ratio of replayed from a buffer and learned on
    # timesteps to sampled from an environment and stored in the replay buffer
    # timesteps. Otherwise, the replay will proceed at the native ratio
    # determined by (train_batch_size / rollout_fragment_length).
    custom_configuration['training_intensity'] = None

    # === Optimization ===
    # Adam epsilon hyper parameter.
    custom_configuration['adam_epsilon'] = 1e-8
    # If not None, clip gradients during optimization at this value.
    custom_configuration['grad_clip'] = 40

    # === Evaluation Settings ===
    # Evaluate with every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    custom_configuration['evaluation_interval'] = 1
    # Number of episodes to run per evaluation period. If using multiple
    # evaluation workers, we will run at least this many episodes total.
    custom_configuration['evaluation_num_episodes'] = 5
    # Internal flag that is set to True for evaluation workers.
    # DEFAULT: 'in_evaluation': False
    # Typical usage is to pass extra args to evaluation env creator
    # and to disable exploration by computing deterministic actions.
    # IMPORTANT NOTE: Policy gradient algorithms are able to find the optimal
    # policy, even if this is a stochastic one. Setting 'explore=False' here
    # will result in the evaluation workers not using this optimal policy!
    custom_configuration['evaluation_config']['explore'] = False
    custom_configuration['evaluation_config']['lr'] = 0
    custom_configuration['evaluation_config']['num_gpus_per_worker'] = 0
    custom_configuration['evaluation_config']['num_gpus'] = 0
    custom_configuration['evaluation_config']['output'] = eval_folder
    # custom_configuration['evaluation_config']['env_config'] = {...}
    # Number of parallel workers to use for evaluation. Note that this is set
    # to zero by default, which means evaluation will be run in the trainer
    # process. If you increase this, it will increase the Ray resource usage
    # of the trainer since evaluation workers are created separately from
    # rollout workers.
    custom_configuration['evaluation_num_workers'] = 1
    # Customize the evaluation method. This must be a function of signature
    # (trainer: Trainer, eval_workers: WorkerSet) -> metrics: dict. See the
    # Trainer._evaluate() method to see the default implementation. The
    # trainer guarantees all eval workers have the latest policy state before
    # this function is called.
    custom_configuration['custom_eval_function'] = None  # custom_eval_function

    return custom_configuration
high_obs = np.concatenate([high_obs, high_obs_fill])
observation_space_multi = gym.spaces.Box(low=low_obs, high=high_obs,
                                         dtype=np.float32)
action_space_multi = gym.spaces.Discrete(15)

# Register env's and custom stuff to RLlib for trainer to be able to use them.
register_env(
    "BlueSkySrv",
    lambda env_config: BlueSkyServerMultiAgent(
        action_space_multi, observation_space_multi,
        settings.max_concurrent, env_config))
ModelCatalog.register_custom_model("Centralized", MyModelCentralized)
ModelCatalog.register_custom_model("CentralizedLSTM", MyModelCentralized2)
# ModelCatalog.register_custom_action_dist("BetaDistributionAction", BetaDistributionAction)
# ModelCatalog.register_custom_action_dist("CategoricalOrdinal", CategoricalOrdinal)
ModelCatalog.register_custom_action_dist("CategoricalOrdinalTFP",
                                         CategoricalOrdinalTFP)

# Init ray.
ray.init()

# def explore(config):
#     # ensure we collect enough timesteps to do sgd
#     if config["train_batch_size"] < config["sgd_minibatch_size"] * 2:
#         config["train_batch_size"] = config["sgd_minibatch_size"] * 2
#     # ensure we run at least one sgd iter
#     if config["num_sgd_iter"] < 1:
#         config["num_sgd_iter"] = 1
#     return config
#
# pbt = PopulationBasedTraining(
#     time_attr="time_total_s",
        )  # batch_size x num_gaussians x action_dim
        cat_samples = self.cat.sample()  # batch_size
        # First we need to expand cat so that it has the same dimension as
        # the normal samples.
        cat_samples = cat_samples.view(-1, 1, 1).expand(-1, -1, self.action_dim)
        # We select the normal distribution based on the outputs of
        # the categorical distribution.
        self.last_sample = torch.gather(normal_samples, 1, cat_samples).squeeze(
            dim=1)  # batch_size x action_dim
        assert len(
            self.last_sample.shape) == 2, f"shape, {self.last_sample.shape}"
        return self.last_sample


ModelCatalog.register_custom_action_dist("gmm",
                                         TorchGaussianMixtureDistribution)


class TorchFlowDistribution(TorchDistributionWrapper):
    # https://github.com/ray-project/ray/blob/be62444bc5924c61d69bb6aec62f967e531e768c/rllib/examples/models/autoregressive_action_dist.py
    @staticmethod
    def required_model_output_shape(action_space, model_config):
        return prod(action_space.shape)

    def __init__(self, inputs: torch.Tensor, model: NormalizingFlowsPolicy):
        super(TorchDistributionWrapper, self).__init__(inputs, model)
        self.model = model
        self.batch_size, self.action_dim = inputs.shape
        self.device = inputs.device
        self.base_dist = Normal(
            torch.zeros(self.batch_size, self.action_dim, device=self.device),
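# A small, self-contained illustration of the expand-and-gather selection used
# in the mixture sampling above. Shapes and values here are hypothetical and
# only meant to show how one Gaussian's sample is picked per batch row.
import torch

batch_size, num_gaussians, action_dim = 4, 3, 2
normal_samples = torch.randn(batch_size, num_gaussians, action_dim)
cat_samples = torch.randint(num_gaussians, (batch_size,))       # batch_size
index = cat_samples.view(-1, 1, 1).expand(-1, -1, action_dim)   # batch_size x 1 x action_dim
selected = torch.gather(normal_samples, 1, index).squeeze(dim=1)
assert selected.shape == (batch_size, action_dim)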
            [a1_logits, a2_logits])
        self.action_model.summary()
        self.register_variables(self.action_model.variables)

    def forward(self, input_dict, state, seq_lens):
        context, self._value_out = self.base_model(input_dict["obs"])
        return context, state

    def value_function(self):
        return tf.reshape(self._value_out, [-1])


if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
    ModelCatalog.register_custom_model("autoregressive_model",
                                       AutoregressiveActionsModel)
    ModelCatalog.register_custom_action_dist("binary_autoreg_output",
                                             BinaryAutoregressiveOutput)
    tune.run(
        args.run,
        stop={"episode_reward_mean": args.stop},
        config={
            "env": CorrelatedActionsEnv,
            "gamma": 0.5,
            "num_gpus": 0,
            "model": {
                "custom_model": "autoregressive_model",
                "custom_action_dist": "binary_autoreg_output",
            },
        })
        t1 = q.concentration1.lgamma() + q.concentration0.lgamma() + (
            sum_params_p).lgamma()
        t2 = p.concentration1.lgamma() + p.concentration0.lgamma() + (
            sum_params_q).lgamma()
        t3 = (p.concentration1 - q.concentration1) * torch.digamma(
            p.concentration1)
        t4 = (p.concentration0 - q.concentration0) * torch.digamma(
            p.concentration0)
        t5 = (sum_params_q - sum_params_p) * torch.digamma(sum_params_p)
        return (t1 - t2 + t3 + t4 + t5).sum(-1)

    def entropy(self):
        return self.dist.entropy().sum(-1)


ModelCatalog.register_custom_action_dist("mydist", MyDist)


# ########### Do Training #################
def main():
    ray.init()
    # Hyperparameters of PPO are not well tuned. Most of them refer to
    # https://github.com/xtma/pytorch_car_caring/blob/master/train.py
    trainer = PPOTrainer(
        env="myenv",
        config={
            "use_pytorch": True,
            "model": {
                "custom_model": "mymodel",
                "custom_options": {
                    'encoder_path': args.encoder_path,
                    'train_encoder': args.train_encoder,
        super(DiscreteActionDistribution, self).__init__(inputs, model)
        self._dist = tfd.Categorical(logits=self.inputs,
                                     validate_args=True,
                                     allow_nan_stats=False)

    def sample(self):
        sample = self._dist.sample()
        self._last_sample_logp = self._dist.log_prob(sample)
        return sample

    def logp(self, action):
        action = tf.cast(action, tf.int32)
        return self._dist.log_prob(action)

    def sampled_action_logp(self):
        return self._last_sample_logp

    def entropy(self):
        return self._dist.entropy()

    def kl(self, other):
        """
        Args:
            other: another DiscreteActionDistribution instance

        Returns:
            KL-Divergence between this distribution and other
        """
        return self._dist.kl_divergence(other._dist)


ModelCatalog.register_custom_action_dist('discrete_action_distribution',
                                         DiscreteActionDistribution)
        return [
            d.log_prob(a) for (a, d) in zip(action_parts, self._distributions)
        ]

    def _logp(self, action_parts):
        # print('action_parts:', action_parts)
        # print('self._logp_parts(action_parts):', self._logp_parts(action_parts))
        logp_parts = self._logp_parts(action_parts)
        total_logp = 0
        for term in logp_parts:
            total_logp += term
        return total_logp
        # return tf.reduce_sum(tf.concat(self._logp_parts(action_parts), axis=-1), axis=-1)

    def _extract_action_parts(self, flat_action):
        sample_parts = []
        next_free_idx = 0
        for d in self._distributions:
            start_idx = next_free_idx
            next_free_idx += d.flat_sample_size()
            flat_sample = flat_action[..., start_idx:next_free_idx]
            shaped_sample = d.flat_to_event_shape(flat_sample)
            # print('Extracting action for distribution (dist: {}, start_idx: {}, next_free_idx: {}, flat_sample: {}, shaped_sample: {}'
            #       .format(d, start_idx, next_free_idx, flat_sample, shaped_sample))
            sample_parts.append(shaped_sample)
        return sample_parts


ModelCatalog.register_custom_action_dist(
    'categorical_gaussian_diag_action_dist', CategoricalGaussianDiagActionDist)
    'log_std_range': args.log_std_range
}

if use_keras_model:
    for key in [
            'fcnet_hiddens', 'fcnet_activation', 'post_fcnet_hiddens',
            'post_fcnet_activation', 'no_final_layer', 'vf_share_layers',
            'free_log_std'
    ]:
        if key in config['model']:
            config['model']['custom_model_config'][key] = config['model'][key]

if args.action_distribution is not None:
    if args.action_distribution == 'truncated_normal':
        from model.custom_action_dist import TruncatedNormal
        ModelCatalog.register_custom_action_dist("truncated_normal",
                                                 TruncatedNormal)
        config['model']['custom_action_dist'] = 'truncated_normal'
    if args.action_distribution == 'truncated_normal_zero_kl':
        from model.custom_action_dist import TruncatedNormalZeroKL
        ModelCatalog.register_custom_action_dist(
            "truncated_normal_zero_kl", TruncatedNormalZeroKL)
        config['model']['custom_action_dist'] = 'truncated_normal_zero_kl'
    if args.action_distribution == 'beta_alpha_beta':
        from model.custom_action_dist import BetaAlphaBeta
        ModelCatalog.register_custom_action_dist("beta_alpha_beta",
                                                 BetaAlphaBeta)
        config['model']['custom_action_dist'] = 'beta_alpha_beta'

config.update(env=env_name)

if args.checkpoint is not None:
parser.add_argument("--run", type=str, default="PPO") # try PG, PPO, IMPALA parser.add_argument("--torch", action="store_true") parser.add_argument("--num-cpus", type=int, default=0) parser.add_argument("--as-test", action="store_true") parser.add_argument("--stop-iters", type=int, default=200) parser.add_argument("--stop-timesteps", type=int, default=100000) parser.add_argument("--stop-reward", type=float, default=200) if __name__ == "__main__": args = parser.parse_args() ray.init(num_cpus=args.num_cpus or None) ModelCatalog.register_custom_model( "autoregressive_model", TorchAutoregressiveActionModel if args.torch else AutoregressiveActionModel) ModelCatalog.register_custom_action_dist( "binary_autoreg_dist", TorchBinaryAutoregressiveDistribution if args.torch else BinaryAutoregressiveDistribution) config = { "env": CorrelatedActionsEnv, "gamma": 0.5, "num_gpus": 0, "model": { "custom_model": "autoregressive_model", "custom_action_dist": "binary_autoreg_dist", }, "use_pytorch": args.torch, } stop = { "training_iteration": args.stop_iters,
if __name__ == "__main__":
    args = get_cli_args()
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    # main part: register and configure autoregressive action model and dist
    # here, tailored to the CorrelatedActionsEnv such that a2 depends on a1
    ModelCatalog.register_custom_model(
        "autoregressive_model",
        TorchAutoregressiveActionModel
        if args.framework == "torch" else AutoregressiveActionModel,
    )
    ModelCatalog.register_custom_action_dist(
        "binary_autoreg_dist",
        TorchBinaryAutoregressiveDistribution
        if args.framework == "torch" else BinaryAutoregressiveDistribution,
    )

    # standard config
    config = {
        "env": CorrelatedActionsEnv,
        "gamma": 0.5,
        # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
        "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
        "framework": args.framework,
    }

    # use registered model and dist in config
    if not args.no_autoreg:
        config["model"] = {
            "custom_model": "autoregressive_model",
from ray.rllib.models import ModelCatalog
from ray.tune.registry import register_env

from .model import ReallocationModel, Dirichlet
from .env import create_env

register_env("TradingEnv", create_env)
ModelCatalog.register_custom_action_dist("dirichlet", Dirichlet)
ModelCatalog.register_custom_model("reallocate", ReallocationModel)
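# Once registered, these names can be referenced by string in an RLlib trainer
# config, mirroring the pattern used in the other snippets above. The block
# below is a hypothetical usage sketch only: the choice of "PPO" and all
# hyperparameters are assumptions, not part of the original module.
import ray
from ray import tune

ray.init()
tune.run(
    "PPO",
    config={
        "env": "TradingEnv",                    # registered via register_env
        "model": {
            "custom_model": "reallocate",       # ReallocationModel
            "custom_action_dist": "dirichlet",  # Dirichlet action distribution
        },
        "framework": "torch",
    },
)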