# Custom multi-agent training workflows built with RLlib's execution-plan API.
# Import paths below assume the Ray 1.x module layout in which this API lives.
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.execution.common import _get_shared_metrics
from ray.rllib.execution.concurrency_ops import Concurrently
from ray.rllib.execution.metric_ops import StandardMetricsReporting
from ray.rllib.execution.replay_buffer import LocalReplayBuffer
from ray.rllib.execution.replay_ops import Replay, StoreToReplayBuffer
from ray.rllib.execution.rollout_ops import (ConcatBatches, ParallelRollouts,
                                             SelectExperiences,
                                             StandardizeFields)
from ray.rllib.execution.train_ops import TrainOneStep, UpdateTargetNetwork


def custom_training_workflow(workers: WorkerSet, config: dict):
    # Replay buffer used by the DQN sub-flow only.
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=1000,
        buffer_size=50000,
        replay_batch_size=64)

    def add_ppo_metrics(batch):
        print("PPO policy learning on samples from",
              batch.policy_batches.keys(), "env steps", batch.env_steps(),
              "agent steps", batch.agent_steps())
        metrics = _get_shared_metrics()
        metrics.counters["agent_steps_trained_PPO"] += batch.agent_steps()
        return batch

    def add_dqn_metrics(batch):
        print("DQN policy learning on samples from",
              batch.policy_batches.keys(), "env steps", batch.env_steps(),
              "agent steps", batch.agent_steps())
        metrics = _get_shared_metrics()
        metrics.counters["agent_steps_trained_DQN"] += batch.agent_steps()
        return batch

    # Generate common experiences and duplicate the stream, one copy per
    # sub-flow.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    r1, r2 = rollouts.duplicate(n=2)

    # DQN sub-flow: store experiences into the replay buffer, then train
    # off-policy on replayed batches and periodically update the target net.
    dqn_store_op = r1.for_each(SelectExperiences(["dqn_policy"])) \
        .for_each(
            StoreToReplayBuffer(local_buffer=local_replay_buffer))
    dqn_replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(add_dqn_metrics) \
        .for_each(TrainOneStep(workers, policies=["dqn_policy"])) \
        .for_each(UpdateTargetNetwork(
            workers, target_update_freq=500, policies=["dqn_policy"]))
    dqn_train_op = Concurrently(
        [dqn_store_op, dqn_replay_op], mode="round_robin", output_indexes=[1])

    # PPO sub-flow: train on-policy on the freshly collected batches.
    ppo_train_op = r2.for_each(SelectExperiences(["ppo_policy"])) \
        .combine(ConcatBatches(
            min_batch_size=200, count_steps_by="env_steps")) \
        .for_each(add_ppo_metrics) \
        .for_each(StandardizeFields(["advantages"])) \
        .for_each(TrainOneStep(
            workers,
            policies=["ppo_policy"],
            num_sgd_iter=10,
            sgd_minibatch_size=128))

    # Combined training flow.
    train_op = Concurrently(
        [ppo_train_op, dqn_train_op], mode="async", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)
def custom_training_workflow_ppo_ddpg(workers: WorkerSet, config: dict):
    # NOTE: unlike the PPO/DQN workflow above, neither sub-flow below uses
    # this replay buffer; both train directly on the rollout stream.
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=1000,
        buffer_size=50000,
        replay_batch_size=64)

    def add_ppo_metrics(batch):
        print("PPO policy learning on samples from",
              batch.policy_batches.keys(), "env steps", batch.env_steps(),
              "agent steps", batch.agent_steps())
        metrics = _get_shared_metrics()
        metrics.counters["agent_steps_trained_PPO"] += batch.agent_steps()
        return batch

    def add_ddpg_metrics(batch):
        print("DDPG policy learning on samples from",
              batch.policy_batches.keys(), "env steps", batch.env_steps(),
              "agent steps", batch.agent_steps())
        metrics = _get_shared_metrics()
        metrics.counters["agent_steps_trained_DDPG"] += batch.agent_steps()
        return batch

    # Generate common experiences and duplicate the stream, one copy per
    # sub-flow.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    r1, r2 = rollouts.duplicate(n=2)

    # PPO sub-flow.
    ppo_train_op = r2.for_each(SelectExperiences(["PPO_policy"])) \
        .combine(ConcatBatches(
            min_batch_size=200, count_steps_by="env_steps")) \
        .for_each(add_ppo_metrics) \
        .for_each(StandardizeFields(["advantages"])) \
        .for_each(TrainOneStep(
            workers,
            policies=["PPO_policy"],
            num_sgd_iter=10,
            sgd_minibatch_size=128))

    # DDPG sub-flow (consumes the second copy of the rollout stream; no
    # advantage standardization here, since DDPG batches carry no
    # "advantages" field).
    ddpg_train_op = r1.for_each(SelectExperiences(["DDPG_policy"])) \
        .combine(ConcatBatches(
            min_batch_size=200, count_steps_by="env_steps")) \
        .for_each(add_ddpg_metrics) \
        .for_each(TrainOneStep(
            workers,
            policies=["DDPG_policy"],
            num_sgd_iter=10,
            sgd_minibatch_size=128))

    # Combined training flow.
    train_op = Concurrently(
        [ppo_train_op, ddpg_train_op], mode="async", output_indexes=[1])

    return StandardMetricsReporting(train_op, workers, config)


# if __name__ == "__main__":
#     args = parser.parse_args()
#     assert not (args.torch and args.mixed_torch_tf), \
#         "Use either --torch or --mixed-torch-tf, not both!"
#
#     ray.init()
#
#     # Simple environment with 4 independent cartpole entities.
#     register_env("multi_agent_cartpole",
#                  lambda _: MultiAgentCartPole({"num_agents": 4}))
#     single_env = gym.make("CartPole-v0")
#     obs_space = single_env.observation_space
#     act_space = single_env.action_space
#
#     # Note that since the trainer below does not include a default policy or
#     # policy configs, we have to explicitly set it in the multiagent config:
#     policies = {
#         "ppo_policy": (PPOTorchPolicy if args.torch or args.mixed_torch_tf
#                        else PPOTFPolicy, obs_space, act_space, PPO_CONFIG),
#         "dqn_policy": (DQNTorchPolicy if args.torch else DQNTFPolicy,
#                        obs_space, act_space, DQN_CONFIG),
#     }
#
#     def policy_mapping_fn(agent_id):
#         if agent_id % 2 == 0:
#             return "ppo_policy"
#         else:
#             return "dqn_policy"
#
#     MyTrainer = build_trainer(
#         name="PPO_DQN_MultiAgent",
#         default_policy=None,
#         execution_plan=custom_training_workflow)
#
#     config = {
#         "rollout_fragment_length": 50,
#         "num_workers": 0,
#         "env": "multi_agent_cartpole",
#         "multiagent": {
#             "policies": policies,
#             "policy_mapping_fn": policy_mapping_fn,
#             "policies_to_train": ["dqn_policy", "ppo_policy"],
#         },
#         # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0.
#         "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")),
#         "framework": "torch" if args.torch else "tf",
#         "_use_trajectory_view_api": True,
#     }
#
#     stop = {
#         "training_iteration": args.stop_iters,
#         "timesteps_total": args.stop_timesteps,
#         "episode_reward_mean": args.stop_reward,
#     }
#
#     results = tune.run(MyTrainer, config=config, stop=stop)
#
#     if args.as_test:
#         check_learning_achieved(results, args.stop_reward)
#
#     ray.shutdown()
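
# --------------------------------------------------------------------------
# A minimal, untested sketch of how the PPO/DDPG workflow above could be
# attached to a trainer, mirroring the commented-out PPO/DQN driver. The
# "multi_agent_pendulum" env name is a placeholder that would have to be
# registered to a multi-agent env with continuous actions (DDPG cannot act in
# CartPole's discrete space), and the empty per-policy config dicts stand in
# for real PPO/DDPG overrides.
# --------------------------------------------------------------------------
# import gym
# import ray
# from ray import tune
# from ray.rllib.agents.ddpg.ddpg_torch_policy import DDPGTorchPolicy
# from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy
# from ray.rllib.agents.trainer_template import build_trainer
#
# if __name__ == "__main__":
#     ray.init()
#
#     single_env = gym.make("Pendulum-v0")
#     obs_space = single_env.observation_space
#     act_space = single_env.action_space
#
#     policies = {
#         "PPO_policy": (PPOTorchPolicy, obs_space, act_space, {}),
#         "DDPG_policy": (DDPGTorchPolicy, obs_space, act_space, {}),
#     }
#
#     def policy_mapping_fn(agent_id):
#         return "PPO_policy" if agent_id % 2 == 0 else "DDPG_policy"
#
#     MyPPODDPGTrainer = build_trainer(
#         name="PPO_DDPG_MultiAgent",
#         default_policy=None,
#         execution_plan=custom_training_workflow_ppo_ddpg)
#
#     config = {
#         "env": "multi_agent_pendulum",  # placeholder; register separately
#         "rollout_fragment_length": 50,
#         "num_workers": 0,
#         "framework": "torch",
#         "multiagent": {
#             "policies": policies,
#             "policy_mapping_fn": policy_mapping_fn,
#             "policies_to_train": ["PPO_policy", "DDPG_policy"],
#         },
#     }
#
#     tune.run(MyPPODDPGTrainer, config=config,
#              stop={"training_iteration": 10})
#     ray.shutdown()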