def execution_plan(workers, config):
    """Assemble the synchronous rollout -> train -> metrics dataflow.

    When ``config["microbatch_size"]`` is truthy, gradients are computed per
    microbatch, ``ceil(train_batch_size / microbatch_size)`` of them are
    averaged, and the averaged gradient is applied in one step. Otherwise a
    single ``TrainOneStep`` runs on each concatenated train batch.

    Args:
        workers: Worker set passed to the rollout, gradient and train ops.
        config: Trainer config; reads ``microbatch_size`` and
            ``train_batch_size``.

    Returns:
        The result of ``StandardMetricsReporting`` for the train op.
    """
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    if not config["microbatch_size"]:
        # Normal mode: one SGD step for every full train batch.
        full_batches = rollouts.combine(
            ConcatBatches(min_batch_size=config["train_batch_size"]))
        train_op = full_batches.for_each(TrainOneStep(workers))
        return StandardMetricsReporting(train_op, workers, config)

    # Microbatch mode: compute gradients on small experience batches, average
    # a fixed number of them, then apply the average in a single SGD step.
    # This keeps GPU memory bounded even for very large train batches.
    num_microbatches = math.ceil(
        config["train_batch_size"] / config["microbatch_size"])
    micro_batches = rollouts.combine(
        ConcatBatches(min_batch_size=config["microbatch_size"]))
    grads = micro_batches.for_each(ComputeGradients(workers))  # (grads, info)
    grad_groups = grads.batch(num_microbatches)  # List[(grads, info)]
    averaged = grad_groups.for_each(AverageGradients())  # (avg_grads, info)
    train_op = averaged.for_each(ApplyGradients(workers))
    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config):
    """Build the train dataflow, either directly on rollouts or via replay.

    With ``config["simple_optimizer"]`` set, rollouts are concatenated to
    ``train_batch_size`` and trained on directly. Otherwise rollouts are
    stored in a ``SimpleReplayBuffer`` and training consumes replayed data,
    with the store and replay ops interleaved round-robin.

    Args:
        workers: Worker set passed to the rollout and train ops.
        config: Trainer config; reads ``simple_optimizer``,
            ``train_batch_size``, ``num_sgd_iter``, ``buffer_size`` and
            ``learning_starts``.

    Returns:
        The result of ``StandardMetricsReporting`` for the train op.
    """
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    if config["simple_optimizer"]:
        batches = rollouts.combine(
            ConcatBatches(min_batch_size=config["train_batch_size"]))
        train_op = batches.for_each(
            TrainOneStep(workers, num_sgd_iter=config["num_sgd_iter"]))
        return StandardMetricsReporting(train_op, workers, config)

    buffer = SimpleReplayBuffer(config["buffer_size"])

    # Branch 1: push every sampled batch into the buffer.
    store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=buffer))

    # Branch 2: replay from the buffer once enough timesteps have elapsed,
    # concatenate to the train batch size and train.
    replayed = Replay(local_buffer=buffer)
    replayed = replayed.filter(
        WaitUntilTimestepsElapsed(config["learning_starts"]))
    replayed = replayed.combine(
        ConcatBatches(min_batch_size=config["train_batch_size"]))
    replay_op = replayed.for_each(
        TrainOneStep(workers, num_sgd_iter=config["num_sgd_iter"]))

    # Only the replay/train branch (index 1) produces output items.
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])
    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config, **kwargs):
    """Build the AlphaZero train dataflow (direct or replay-based).

    Args:
        workers: Worker set passed to the rollout and train ops.
        config: Trainer config; reads ``simple_optimizer``,
            ``train_batch_size``, ``num_sgd_iter``, ``buffer_size``,
            ``learning_starts`` and ``multiagent.count_steps_by``.
        **kwargs: Must be empty; any extra keyword fails the assertion.

    Returns:
        The result of ``StandardMetricsReporting`` for the train op.
    """
    assert not kwargs, (
        "Alpha zero execution_plan does NOT take any additional parameters")

    def _concat_to_train_batch():
        # Concatenation settings shared by both branches.
        return ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        )

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    if config["simple_optimizer"]:
        batches = rollouts.combine(_concat_to_train_batch())
        train_op = batches.for_each(
            TrainOneStep(workers, num_sgd_iter=config["num_sgd_iter"]))
        return StandardMetricsReporting(train_op, workers, config)

    buffer = SimpleReplayBuffer(config["buffer_size"])
    store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=buffer))

    replayed = Replay(local_buffer=buffer)
    replayed = replayed.filter(
        WaitUntilTimestepsElapsed(config["learning_starts"]))
    replayed = replayed.combine(_concat_to_train_batch())
    replay_op = replayed.for_each(
        TrainOneStep(workers, num_sgd_iter=config["num_sgd_iter"]))

    # Interleave storing and training; only the training branch emits output.
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])
    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config):
    """Train two remote-worker groups independently and merge their metrics.

    The remote workers of ``workers`` are split into two groups (indices
    0-5 and 6-10). Each group gets its own synchronous rollout stream and its
    own ``TrainTFMultiGPU`` step, and the two metrics streams are unioned.

    The previous version bound both groups to the *same* ``workers`` object
    and called ``reset()`` on it twice, so the second call overwrote the
    first and both pipelines ran on the second slice only. Each group is now
    a shallow copy of ``workers`` that is reset to its own slice; ``workers``
    itself is no longer mutated.

    Args:
        workers: Worker set whose remote workers are partitioned.
        config: Trainer config; reads ``train_batch_size``, ``num_gpus``,
            ``_fake_gpus``, ``framework`` and ``multiagent.count_steps_by``.

    Returns:
        The union of the two ``StandardMetricsReporting`` streams.
    """
    import copy

    # NOTE(review): the returned gradient stream was never consumed in the
    # original either; the call is kept only in case it has side effects on
    # `workers` -- confirm and remove if it does not.
    AsyncGradients(workers)

    remote = workers.remote_workers()

    # Shallow copies, so each group can hold its own remote-worker list.
    # NOTE(review): assumes the worker-set type is shallow-copyable and that
    # reset() only rebinds the remote-worker list -- confirm.
    # NOTE(review): the split is hard-coded; with fewer than 7 remote
    # workers the second group is empty.
    group_a = copy.copy(workers)
    group_b = copy.copy(workers)
    group_a.reset(remote[0:6])
    group_b.reset(remote[6:11])

    def _sync_train_op(group):
        # Rollouts -> concat to train batch size -> one multi-GPU train step.
        rollouts = ParallelRollouts(group, mode="bulk_sync")
        train_step = TrainTFMultiGPU(
            workers=group,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))
        return rollouts.combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=config["multiagent"]["count_steps_by"],
            )).for_each(train_step)

    train_op_a = _sync_train_op(group_a)
    train_op_b = _sync_train_op(group_b)

    metrics_a = StandardMetricsReporting(train_op_a, group_a, config)
    metrics_b = StandardMetricsReporting(train_op_b, group_b, config)
    return metrics_a.union(metrics_b)
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    """Define the distributed dataflow for A2C.

    Args:
        workers (WorkerSet): The WorkerSet used to train the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.
        **kwargs: Must be empty; any extra keyword fails the assertion.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    assert not kwargs, (
        "A2C execution_plan does NOT take any additional parameters")

    def _concat(size_key):
        # Concatenate sample batches up to config[size_key] steps.
        return ConcatBatches(
            min_batch_size=config[size_key],
            count_steps_by=config["multiagent"]["count_steps_by"])

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    if config["microbatch_size"]:
        # Microbatch mode: gradients are computed on small experience
        # batches, several of them are averaged, and the average is applied
        # in one SGD step. This conserves GPU memory and allows extremely
        # large experience batches.
        num_microbatches = math.ceil(
            config["train_batch_size"] / config["microbatch_size"])
        micro_batches = rollouts.combine(_concat("microbatch_size"))
        grads = micro_batches.for_each(
            ComputeGradients(workers))  # (grads, info)
        grad_groups = grads.batch(num_microbatches)  # List[(grads, info)]
        averaged = grad_groups.for_each(
            AverageGradients())  # (avg_grads, info)
        train_op = averaged.for_each(ApplyGradients(workers))
        return StandardMetricsReporting(train_op, workers, config)

    # Normal mode: one SGD step per train batch.
    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    train_batches = rollouts.combine(_concat("train_batch_size"))
    train_op = train_batches.for_each(train_step_op)
    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(
    workers: WorkerSet, config: TrainerConfigDict, **kwargs
) -> LocalIterator[dict]:
    """Define the A2C dataflow: sync rollouts, then microbatch or plain SGD.

    Args:
        workers: Worker set passed to the rollout, gradient and train ops.
        config: Trainer config; reads ``microbatch_size``,
            ``train_batch_size``, ``simple_optimizer``, ``num_gpus``,
            ``_fake_gpus`` and ``multiagent.count_steps_by``.
        **kwargs: Must be empty; any extra keyword fails the assertion.

    Returns:
        The result of ``StandardMetricsReporting`` for the train op.
    """
    assert (
        not kwargs
    ), "A2C execution_plan does NOT take any additional parameters"

    count_by = config["multiagent"]["count_steps_by"]
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    if not config["microbatch_size"]:
        # Normal mode: execute one SGD step per train batch.
        if config["simple_optimizer"]:
            train_step_op = TrainOneStep(workers)
        else:
            train_step_op = MultiGPUTrainOneStep(
                workers=workers,
                sgd_minibatch_size=config["train_batch_size"],
                num_sgd_iter=1,
                num_gpus=config["num_gpus"],
                _fake_gpus=config["_fake_gpus"],
            )
        train_batches = rollouts.combine(
            ConcatBatches(
                min_batch_size=config["train_batch_size"],
                count_steps_by=count_by,
            )
        )
        train_op = train_batches.for_each(train_step_op)
        return StandardMetricsReporting(train_op, workers, config)

    # Microbatch mode: compute gradients per experience microbatch, average
    # a number of them, then apply the averaged gradient in one SGD step.
    # This conserves GPU memory, so very large experience batches can be
    # used.
    num_microbatches = math.ceil(
        config["train_batch_size"] / config["microbatch_size"]
    )
    micro_batches = rollouts.combine(
        ConcatBatches(
            min_batch_size=config["microbatch_size"],
            count_steps_by=count_by,
        )
    )
    grads = micro_batches.for_each(ComputeGradients(workers))  # (grads, info)
    grad_groups = grads.batch(num_microbatches)  # List[(grads, info)]
    averaged = grad_groups.for_each(AverageGradients())  # (avg_grads, info)
    train_op = averaged.for_each(ApplyGradients(workers))
    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Define the distributed dataflow for MARWIL/BC.

    Rollouts are stored in a ``SimpleReplayBuffer``; training replays from
    that buffer, concatenates to ``train_batch_size`` and runs one train step
    per batch. Storing and training alternate round-robin.

    Args:
        workers (WorkerSet): The WorkerSet used to train the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    buffer = SimpleReplayBuffer(config["replay_buffer_size"])

    # Writer branch: every sampled batch goes into the buffer.
    store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=buffer))

    # Reader branch: replay -> concat -> train.
    replayed = Replay(local_buffer=buffer)
    train_batches = replayed.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))
    replay_op = train_batches.for_each(TrainOneStep(workers))

    # Only the reader branch (index 1) emits output items.
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])
    return StandardMetricsReporting(train_op, workers, config)
def gather_experiences_directly(workers, config):
    """Build an async stream of train batches with replay mixed in.

    Args:
        workers: Worker set to sample from.
        config: Trainer config; reads
            ``max_requests_in_flight_per_sampler_worker``,
            ``replay_buffer_num_slots``, ``replay_proportion``,
            ``train_batch_size`` and ``multiagent.count_steps_by``.

    Returns:
        The iterator of concatenated train batches.
    """

    def _decompress(batch):
        return batch.decompress_if_needed()

    rollouts = ParallelRollouts(
        workers,
        mode="async",
        num_async=config["max_requests_in_flight_per_sampler_worker"],
    )

    # Augment with replay and concat to desired train batch size.
    decompressed = rollouts.for_each(_decompress)
    with_replay = decompressed.for_each(
        MixInReplay(
            num_slots=config["replay_buffer_num_slots"],
            replay_proportion=config["replay_proportion"],
        )
    )
    flattened = with_replay.flatten()
    return flattened.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        )
    )
def default_execution_plan(workers: WorkerSet, config: TrainerConfigDict):
    """Default dataflow: sync rollouts -> concat -> one train step -> metrics.

    Args:
        workers: Worker set to sample from and train.
        config: Trainer config; reads ``train_batch_size``,
            ``multiagent.count_steps_by``, ``num_gpus``, ``_fake_gpus``,
            ``framework`` and, optionally, ``simple_optimizer``,
            ``sgd_minibatch_size``, ``num_sgd_iter``, ``shuffle_sequences``.

    Returns:
        The result of ``StandardMetricsReporting`` for the train op.
    """
    # Collect experiences in parallel from multiple RolloutWorker actors.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Combine experience batches until `train_batch_size` is reached.
    train_batches = rollouts.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))

    # Train the policy on those experiences and update the workers. The
    # simple path is taken only when `simple_optimizer` is literally True.
    if config.get("simple_optimizer") is True:
        train_step = TrainOneStep(workers)
    else:
        train_step = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config.get("sgd_minibatch_size",
                                          config["train_batch_size"]),
            num_sgd_iter=config.get("num_sgd_iter", 1),
            num_gpus=config["num_gpus"],
            shuffle_sequences=config.get("shuffle_sequences", False),
            _fake_gpus=config["_fake_gpus"],
            framework=config["framework"])
    train_op = train_batches.for_each(train_step)

    # Add the standard episode reward, etc. metrics reporting. This returns
    # a LocalIterator[metrics_dict] representing metrics for each train step.
    return StandardMetricsReporting(train_op, workers, config)
def test_concat_batches(ray_start_regular_shared):
    """ConcatBatches(1000) yields a 1000-step batch and records a timer."""
    worker_set = make_workers(0)
    rollouts = ParallelRollouts(worker_set, mode="async")
    combined = rollouts.combine(ConcatBatches(1000))

    first_batch = next(combined)
    assert first_batch.count == 1000

    # Sampling must have registered a "sample" timer in the shared metrics.
    recorded_timers = combined.shared_metrics.get().timers
    assert "sample" in recorded_timers
def generator():
    """Yield train batches gathered asynchronously from ``rollout_group``.

    Reads ``self``, ``rollout_group`` and ``config`` from the enclosing
    scope. Each gathered batch triggers a weight push to its source worker
    (when ``self.weights`` is truthy), is decompressed, mixed with replay,
    flattened and concatenated to ``config["train_batch_size"]``.
    """
    sampled = rollout_group.gather_async(
        num_async=config["max_sample_requests_in_flight_per_worker"])

    def update_worker(item):
        # Update the rollout worker with our latest policy weights.
        worker, batch = item
        if self.weights:
            worker.set_weights.remote(self.weights, self.global_vars)
        return batch

    def decompress(batch):
        return batch.decompress_if_needed()

    # Augment with replay and concat to desired train batch size.
    batches = sampled.zip_with_source_actor().for_each(update_worker)
    batches = batches.for_each(decompress)
    batches = batches.for_each(
        MixInReplay(
            num_slots=config["replay_buffer_num_slots"],
            replay_proportion=config["replay_proportion"]))
    batches = batches.flatten()
    train_batches = batches.combine(
        ConcatBatches(min_batch_size=config["train_batch_size"]))

    for train_batch in train_batches:
        yield train_batch
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    """Define the MARWIL dataflow backed by a MultiAgentReplayBuffer.

    Args:
        workers: Worker set to sample from and train.
        config: Trainer config; reads ``learning_starts``,
            ``replay_buffer_size``, ``train_batch_size`` and
            ``multiagent.count_steps_by``.
        **kwargs: Must be empty; any extra keyword fails the assertion.

    Returns:
        The result of ``StandardMetricsReporting`` for the train op.
    """
    assert not kwargs, (
        "Marwill execution_plan does NOT take any additional parameters")

    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    buffer = MultiAgentReplayBuffer(
        learning_starts=config["learning_starts"],
        capacity=config["replay_buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_sequence_length=1,
    )

    # Writer branch: store every sampled batch.
    store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=buffer))

    # Reader branch: replay -> concat to train batch size -> train.
    replayed = Replay(local_buffer=buffer)
    train_batches = replayed.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))
    replay_op = train_batches.for_each(TrainOneStep(workers))

    # Alternate the two branches; only the reader branch emits output.
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])
    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(
    workers: WorkerSet, config: TrainerConfigDict, **kwargs
) -> LocalIterator[dict]:
    """Define the QMIX dataflow: store rollouts, train from replay.

    Args:
        workers: Worker set to sample from and train.
        config: Trainer config; reads ``buffer_size``, ``train_batch_size``,
            ``target_network_update_freq`` and ``multiagent.count_steps_by``.
        **kwargs: Must be empty; any extra keyword fails the assertion.

    Returns:
        The result of ``StandardMetricsReporting`` for the merged op.
    """
    assert (
        not kwargs
    ), "QMIX execution_plan does NOT take any additional parameters"

    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    buffer = SimpleReplayBuffer(config["buffer_size"])

    # Writer branch: push sampled batches into the buffer.
    store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=buffer))

    # Reader branch: replay -> concat -> train -> target network update.
    replayed = Replay(local_buffer=buffer)
    train_batches = replayed.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        )
    )
    trained = train_batches.for_each(TrainOneStep(workers))
    train_op = trained.for_each(
        UpdateTargetNetwork(workers, config["target_network_update_freq"])
    )

    # Alternate the branches; only the reader branch (index 1) emits output.
    merged_op = Concurrently(
        [store_op, train_op], mode="round_robin", output_indexes=[1]
    )
    return StandardMetricsReporting(merged_op, workers, config)
def execution_plan(workers, config):
    """Build the PPO-style dataflow with KL update and reward-scale warning.

    Args:
        workers: Worker set to sample from and train.
        config: Trainer config; reads ``train_batch_size``,
            ``simple_optimizer``, ``num_sgd_iter``, ``sgd_minibatch_size``,
            ``num_gpus``, ``rollout_fragment_length``,
            ``num_envs_per_worker``, ``shuffle_sequences`` and ``_fake_gpus``.

    Returns:
        The ``StandardMetricsReporting`` stream, with each result passed
        through ``warn_about_bad_reward_scales``.
    """

    def _second_item(result):
        return result[1]

    def _check_reward_scale(result):
        return warn_about_bad_reward_scales(config, result)

    # Collect large batches of relevant experiences & standardize.
    samples = ParallelRollouts(workers, mode="bulk_sync")
    samples = samples.for_each(
        SelectExperiences(workers.trainable_policies()))
    samples = samples.combine(
        ConcatBatches(min_batch_size=config["train_batch_size"]))
    samples = samples.for_each(StandardizeFields(["advantages"]))

    if config["simple_optimizer"]:
        train_step = TrainOneStep(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            sgd_minibatch_size=config["sgd_minibatch_size"])
    else:
        train_step = TrainTFMultiGPU(
            workers,
            sgd_minibatch_size=config["sgd_minibatch_size"],
            num_sgd_iter=config["num_sgd_iter"],
            num_gpus=config["num_gpus"],
            rollout_fragment_length=config["rollout_fragment_length"],
            num_envs_per_worker=config["num_envs_per_worker"],
            train_batch_size=config["train_batch_size"],
            shuffle_sequences=config["shuffle_sequences"],
            _fake_gpus=config["_fake_gpus"])
    train_op = samples.for_each(train_step)

    # Update KL after each round of training, using the second element of
    # each train-step result.
    train_op = train_op.for_each(_second_item).for_each(UpdateKL(workers))

    metrics = StandardMetricsReporting(train_op, workers, config)
    return metrics.for_each(_check_reward_scale)
def custom_training_workflow(workers: WorkerSet, config: dict):
    """Train a PPO policy and a DQN policy from one shared rollout stream.

    The rollouts are duplicated: one copy feeds a replay-based DQN sub-flow,
    the other an on-policy PPO sub-flow. Both run concurrently in async mode,
    and only the DQN sub-flow (index 1) contributes output items.

    Args:
        workers: Worker set to sample from and train.
        config: Config dict forwarded to ``StandardMetricsReporting``.

    Returns:
        The result of ``StandardMetricsReporting`` for the combined op.
    """
    replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=1000,
        buffer_size=50000,
        replay_batch_size=64)

    def add_ppo_metrics(batch):
        # Log the batch and count its steps towards the PPO counter.
        steps = batch.env_steps
        print("PPO policy learning on samples from",
              batch.policy_batches.keys(), "env steps", steps(),
              "agent steps", steps())
        shared = _get_shared_metrics()
        shared.counters["agent_steps_trained_PPO"] += steps()
        return batch

    def add_dqn_metrics(batch):
        # Log the batch and count its steps towards the DQN counter.
        steps = batch.env_steps
        print("DQN policy learning on samples from",
              batch.policy_batches.keys(), "env steps", steps(),
              "agent steps", steps())
        shared = _get_shared_metrics()
        shared.counters["agent_steps_trained_DQN"] += steps()
        return batch

    # Generate common experiences.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    dqn_rollouts, ppo_rollouts = rollouts.duplicate(n=2)

    # DQN sub-flow: store DQN experiences, train from replay.
    dqn_samples = dqn_rollouts.for_each(SelectExperiences(["dqn_policy"]))
    dqn_store_op = dqn_samples.for_each(
        StoreToReplayBuffer(local_buffer=replay_buffer))

    dqn_replayed = Replay(local_buffer=replay_buffer)
    dqn_replayed = dqn_replayed.for_each(add_dqn_metrics)
    dqn_replayed = dqn_replayed.for_each(
        TrainOneStep(workers, policies=["dqn_policy"]))
    dqn_replay_op = dqn_replayed.for_each(
        UpdateTargetNetwork(
            workers, target_update_freq=500, policies=["dqn_policy"]))

    dqn_train_op = Concurrently(
        [dqn_store_op, dqn_replay_op],
        mode="round_robin",
        output_indexes=[1])

    # PPO sub-flow: concat to 200 env steps, standardize, train.
    ppo_samples = ppo_rollouts.for_each(SelectExperiences(["ppo_policy"]))
    ppo_samples = ppo_samples.combine(
        ConcatBatches(min_batch_size=200, count_steps_by="env_steps"))
    ppo_samples = ppo_samples.for_each(add_ppo_metrics)
    ppo_samples = ppo_samples.for_each(StandardizeFields(["advantages"]))
    ppo_train_op = ppo_samples.for_each(
        TrainOneStep(
            workers,
            policies=["ppo_policy"],
            num_sgd_iter=10,
            sgd_minibatch_size=128))

    # Combined training flow.
    train_op = Concurrently(
        [ppo_train_op, dqn_train_op], mode="async", output_indexes=[1])
    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Define the distributed dataflow for PPO.

    Args:
        workers (WorkerSet): The WorkerSet used to train the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.

    Returns:
        LocalIterator[dict]: The ``StandardMetricsReporting`` stream, with
            each result passed through ``warn_about_bad_reward_scales``.
    """

    def _second_item(result):
        return result[1]

    def _check_reward_scale(result):
        return warn_about_bad_reward_scales(config, result)

    samples = ParallelRollouts(workers, mode="bulk_sync")

    # Collect batches for the trainable policies.
    samples = samples.for_each(
        SelectExperiences(workers.trainable_policies()))

    # Concatenate the SampleBatches into one.
    samples = samples.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))

    # Standardize advantages.
    samples = samples.for_each(StandardizeFields(["advantages"]))

    # Perform one training step on the combined + standardized batch.
    if config["simple_optimizer"]:
        train_step = TrainOneStep(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            sgd_minibatch_size=config["sgd_minibatch_size"])
    else:
        train_step = TrainTFMultiGPU(
            workers,
            sgd_minibatch_size=config["sgd_minibatch_size"],
            num_sgd_iter=config["num_sgd_iter"],
            num_gpus=config["num_gpus"],
            rollout_fragment_length=config["rollout_fragment_length"],
            num_envs_per_worker=config["num_envs_per_worker"],
            train_batch_size=config["train_batch_size"],
            shuffle_sequences=config["shuffle_sequences"],
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))
    train_op = samples.for_each(train_step)

    # Update KL after each round of training.
    train_op = train_op.for_each(_second_item).for_each(UpdateKL(workers))

    # Warn about bad reward scales and return training metrics.
    metrics = StandardMetricsReporting(train_op, workers, config)
    return metrics.for_each(_check_reward_scale)
def execution_plan(workers, config):
    """Build a replay-mixed train dataflow with target network updates.

    Args:
        workers: Worker set to sample from and train.
        config: Trainer config; reads ``buffer_size``, ``train_batch_size``
            and ``target_network_update_freq``.

    Returns:
        The result of ``StandardMetricsReporting`` for the train op.
    """
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Mix replayed data into the fresh samples, then batch up for training.
    mixed = rollouts.for_each(MixInReplay(config["buffer_size"]))
    train_batches = mixed.combine(
        ConcatBatches(min_batch_size=config["train_batch_size"]))

    # One train step per batch, followed by the target network update op.
    trained = train_batches.for_each(TrainOneStep(workers))
    train_op = trained.for_each(
        UpdateTargetNetwork(workers, config["target_network_update_freq"]))

    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config):
    """Build the A2C-style dataflow with optional gradient microbatching.

    Args:
        workers: Worker set passed to the rollout, gradient and train ops.
        config: Trainer config; reads ``microbatch_size``,
            ``train_batch_size``, ``simple_optimizer``, ``num_gpus``,
            ``_fake_gpus``, ``framework`` and ``multiagent.count_steps_by``.

    Returns:
        The result of ``StandardMetricsReporting`` for the train op.
    """
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    if config["microbatch_size"]:
        # Microbatch mode: gradients are computed on experience microbatches,
        # a number of them are averaged, and the averaged gradient is applied
        # in one SGD step. This conserves GPU memory, allowing extremely
        # large experience batches.
        num_microbatches = math.ceil(
            config["train_batch_size"] / config["microbatch_size"])
        micro_concat = ConcatBatches(
            min_batch_size=config["microbatch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"])
        grads = rollouts.combine(micro_concat).for_each(
            ComputeGradients(workers))  # (grads, info)
        grad_groups = grads.batch(num_microbatches)  # List[(grads, info)]
        averaged = grad_groups.for_each(
            AverageGradients())  # (avg_grads, info)
        train_op = averaged.for_each(ApplyGradients(workers))
        return StandardMetricsReporting(train_op, workers, config)

    # Normal mode: one SGD step per train batch.
    if config["simple_optimizer"]:
        train_step_op = TrainOneStep(workers)
    else:
        train_step_op = TrainTFMultiGPU(
            workers=workers,
            sgd_minibatch_size=config["train_batch_size"],
            num_sgd_iter=1,
            num_gpus=config["num_gpus"],
            shuffle_sequences=True,
            _fake_gpus=config["_fake_gpus"],
            framework=config.get("framework"))

    full_concat = ConcatBatches(
        min_batch_size=config["train_batch_size"],
        count_steps_by=config["multiagent"]["count_steps_by"])
    train_op = rollouts.combine(full_concat).for_each(train_step_op)
    return StandardMetricsReporting(train_op, workers, config)
def default_execution_plan(workers: WorkerSet, config: TrainerConfigDict):
    """Default dataflow: sync rollouts -> concat -> TrainOneStep -> metrics.

    Args:
        workers: Worker set to sample from and train.
        config: Trainer config; reads ``train_batch_size``.

    Returns:
        The result of ``StandardMetricsReporting`` for the train op.
    """
    # Collect experiences in parallel from multiple RolloutWorker actors.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Combine experience batches until `train_batch_size` is reached.
    train_batches = rollouts.combine(
        ConcatBatches(min_batch_size=config["train_batch_size"]))

    # Train the policy on those experiences and update the workers.
    train_op = train_batches.for_each(TrainOneStep(workers))

    # Add the standard episode reward, etc. metrics reporting. This returns
    # a LocalIterator[metrics_dict] representing metrics for each train step.
    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config):
    """Build a store-and-replay train dataflow on a SimpleReplayBuffer.

    Args:
        workers: Worker set to sample from and train.
        config: Trainer config; reads ``replay_buffer_size`` and
            ``train_batch_size``.

    Returns:
        The result of ``StandardMetricsReporting`` for the train op.
    """
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    buffer = SimpleReplayBuffer(config["replay_buffer_size"])

    # Writer branch: store every sampled batch.
    store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=buffer))

    # Reader branch: replay -> concat to train batch size -> train.
    replayed = Replay(local_buffer=buffer)
    train_batches = replayed.combine(
        ConcatBatches(min_batch_size=config["train_batch_size"]))
    replay_op = train_batches.for_each(TrainOneStep(workers))

    # Alternate the branches; only the reader branch (index 1) emits output.
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])
    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config):
    """Build the PPO-style dataflow with an inline KL-update callback.

    Args:
        workers: Worker set to sample from and train.
        config: Trainer config; reads ``train_batch_size``,
            ``simple_optimizer``, ``num_sgd_iter``, ``sgd_minibatch_size``,
            ``num_gpus``, ``rollout_fragment_length``,
            ``num_envs_per_worker``, ``shuffle_sequences`` and ``_fake_gpus``.

    Returns:
        The ``StandardMetricsReporting`` stream, with each result passed
        through ``_warn_about_bad_reward_scales``.
    """

    def update_kl(item):
        # Callback to update the KL based on optimization info. It returns
        # nothing, so the downstream op receives None per item.
        _, fetches = item

        def update(pi, pi_id):
            if pi_id not in fetches:
                logger.warning(
                    "No data for {}, not updating kl".format(pi_id))
                return
            pi.update_kl(fetches[pi_id]["kl"])

        workers.local_worker().foreach_trainable_policy(update)

    def _check_reward_scale(result):
        return _warn_about_bad_reward_scales(config, result)

    # Collect large batches of relevant experiences & standardize.
    samples = ParallelRollouts(workers, mode="bulk_sync")
    samples = samples.for_each(
        SelectExperiences(workers.trainable_policies()))
    samples = samples.combine(
        ConcatBatches(min_batch_size=config["train_batch_size"]))
    samples = samples.for_each(StandardizeFields(["advantages"]))

    if config["simple_optimizer"]:
        train_step = TrainOneStep(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            sgd_minibatch_size=config["sgd_minibatch_size"])
    else:
        train_step = TrainTFMultiGPU(
            workers,
            sgd_minibatch_size=config["sgd_minibatch_size"],
            num_sgd_iter=config["num_sgd_iter"],
            num_gpus=config["num_gpus"],
            rollout_fragment_length=config["rollout_fragment_length"],
            num_envs_per_worker=config["num_envs_per_worker"],
            train_batch_size=config["train_batch_size"],
            shuffle_sequences=config["shuffle_sequences"],
            _fake_gpus=config["_fake_gpus"])
    train_op = samples.for_each(train_step)

    # Update KL after each round of training.
    train_op = train_op.for_each(update_kl)

    metrics = StandardMetricsReporting(train_op, workers, config)
    return metrics.for_each(_check_reward_scale)
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    """Define the PPO dataflow using TrainOneStep or MultiGPUTrainOneStep.

    Args:
        workers: Worker set to sample from and train.
        config: Trainer config; reads ``train_batch_size``,
            ``multiagent.count_steps_by``, ``simple_optimizer``,
            ``num_sgd_iter``, ``sgd_minibatch_size``, ``num_gpus`` and
            ``_fake_gpus``.
        **kwargs: Must be empty; any extra keyword fails the assertion.

    Returns:
        The ``StandardMetricsReporting`` stream, with each result passed
        through ``warn_about_bad_reward_scales``.
    """
    assert not kwargs, (
        "PPO execution_plan does NOT take any additional parameters")

    def _second_item(result):
        return result[1]

    def _check_reward_scale(result):
        return warn_about_bad_reward_scales(config, result)

    samples = ParallelRollouts(workers, mode="bulk_sync")

    # Collect batches for the trainable policies.
    samples = samples.for_each(
        SelectExperiences(local_worker=workers.local_worker()))

    # Concatenate the SampleBatches into one.
    samples = samples.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))

    # Standardize advantages.
    samples = samples.for_each(StandardizeFields(["advantages"]))

    # Perform one training step on the combined + standardized batch.
    if config["simple_optimizer"]:
        train_step = TrainOneStep(
            workers,
            num_sgd_iter=config["num_sgd_iter"],
            sgd_minibatch_size=config["sgd_minibatch_size"],
        )
    else:
        train_step = MultiGPUTrainOneStep(
            workers=workers,
            sgd_minibatch_size=config["sgd_minibatch_size"],
            num_sgd_iter=config["num_sgd_iter"],
            num_gpus=config["num_gpus"],
            _fake_gpus=config["_fake_gpus"],
        )
    train_op = samples.for_each(train_step)

    # Update KL after each round of training.
    train_op = train_op.for_each(_second_item).for_each(UpdateKL(workers))

    # Warn about bad reward scales and return training metrics.
    metrics = StandardMetricsReporting(train_op, workers, config)
    return metrics.for_each(_check_reward_scale)
def execution_plan(workers: WorkerSet, config: TrainerConfigDict,
                   **kwargs) -> LocalIterator[dict]:
    """Define the distributed dataflow for MARWIL/BC.

    Rollouts are stored in a ``LocalReplayBuffer``; training replays from it,
    concatenates to ``train_batch_size`` and runs one train step per batch.

    Args:
        workers (WorkerSet): The WorkerSet used to train the Polic(y/ies)
            of the Trainer.
        config (TrainerConfigDict): The trainer's configuration dict.
        **kwargs: Must be empty; any extra keyword fails the assertion.

    Returns:
        LocalIterator[dict]: A local iterator over training metrics.
    """
    assert not kwargs, (
        "Marwill execution_plan does NOT take any additional parameters")

    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    buffer = LocalReplayBuffer(
        learning_starts=config["learning_starts"],
        capacity=config["replay_buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_sequence_length=1,
    )

    # Writer branch: store every sampled batch.
    store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=buffer))

    # Reader branch: replay -> concat to train batch size -> train.
    replayed = Replay(local_buffer=buffer)
    train_batches = replayed.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"],
        ))
    replay_op = train_batches.for_each(TrainOneStep(workers))

    # Alternate the branches; only the reader branch (index 1) emits output.
    train_op = Concurrently(
        [store_op, replay_op], mode="round_robin", output_indexes=[1])
    return StandardMetricsReporting(train_op, workers, config)
def execution_plan(workers, config):
    """Build a store-and-replay dataflow with target network updates.

    Args:
        workers: Worker set to sample from and train.
        config: Trainer config; reads ``buffer_size``, ``train_batch_size``,
            ``target_network_update_freq`` and ``multiagent.count_steps_by``.

    Returns:
        The result of ``StandardMetricsReporting`` for the merged op.
    """
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    buffer = SimpleReplayBuffer(config["buffer_size"])

    # Writer branch: push sampled batches into the buffer.
    store_op = rollouts.for_each(StoreToReplayBuffer(local_buffer=buffer))

    # Reader branch: replay -> concat -> train -> target network update.
    replayed = Replay(local_buffer=buffer)
    train_batches = replayed.combine(
        ConcatBatches(
            min_batch_size=config["train_batch_size"],
            count_steps_by=config["multiagent"]["count_steps_by"]
        ))
    trained = train_batches.for_each(TrainOneStep(workers))
    train_op = trained.for_each(
        UpdateTargetNetwork(workers, config["target_network_update_freq"]))

    # Alternate the branches; only the reader branch (index 1) emits output.
    merged_op = Concurrently(
        [store_op, train_op], mode="round_robin", output_indexes=[1])
    return StandardMetricsReporting(merged_op, workers, config)
def custom_training_workflow_ppo_ddpg(workers: WorkerSet, config: dict):
    """Train a PPO policy and a DDPG policy from one shared rollout stream.

    The rollouts are duplicated into two streams, one per sub-flow. The
    previous version fed BOTH sub-flows from ``r2`` and never consumed
    ``r1``, so the two policies competed for the same stream while the other
    copy was left unread. PPO now reads ``r2`` and DDPG reads ``r1``.

    Both sub-flows run concurrently in async mode; only the DDPG sub-flow
    (index 1) contributes output items.

    Args:
        workers: Worker set to sample from and train.
        config: Config dict forwarded to ``StandardMetricsReporting``.

    Returns:
        The result of ``StandardMetricsReporting`` for the combined op.
    """
    # NOTE(review): this buffer is created but not used by either sub-flow
    # below. Presumably the DDPG flow was meant to store to and replay from
    # it -- confirm.
    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=1000,
        buffer_size=50000,
        replay_batch_size=64)

    def add_ppo_metrics(batch):
        # Log the batch and count its env steps towards the PPO counter.
        print("PPO policy learning on samples from",
              batch.policy_batches.keys(), "env steps", batch.env_steps(),
              "agent steps", batch.env_steps())
        metrics = _get_shared_metrics()
        metrics.counters["agent_steps_trained_PPO"] += batch.env_steps()
        return batch

    def add_ddpg_metrics(batch):
        # Log the batch and count its env steps towards the DDPG counter.
        print("DDPG policy learning on samples from",
              batch.policy_batches.keys(), "env steps", batch.env_steps(),
              "agent steps", batch.env_steps())
        metrics = _get_shared_metrics()
        metrics.counters["agent_steps_trained_DDPG"] += batch.env_steps()
        return batch

    # Generate common experiences; each duplicate feeds exactly one sub-flow.
    rollouts = ParallelRollouts(workers, mode="bulk_sync")
    r1, r2 = rollouts.duplicate(n=2)

    # PPO sub-flow (reads r2). count_steps_by is not passed to ConcatBatches
    # here, so its default applies.
    ppo_train_op = r2.for_each(SelectExperiences(["PPO_policy"])) \
        .combine(ConcatBatches(min_batch_size=200)) \
        .for_each(add_ppo_metrics) \
        .for_each(StandardizeFields(["advantages"])) \
        .for_each(TrainOneStep(
            workers,
            policies=["PPO_policy"],
            num_sgd_iter=10,
            sgd_minibatch_size=128))

    # DDPG sub-flow (reads r1, its own duplicate of the rollouts).
    # NOTE(review): this mirrors the PPO pipeline, including standardizing
    # "advantages" -- verify that DDPG batches carry that field.
    ddpg_train_op = r1.for_each(SelectExperiences(["DDPG_policy"])) \
        .combine(ConcatBatches(min_batch_size=200)) \
        .for_each(add_ddpg_metrics) \
        .for_each(StandardizeFields(["advantages"])) \
        .for_each(TrainOneStep(
            workers,
            policies=["DDPG_policy"],
            num_sgd_iter=10,
            sgd_minibatch_size=128))

    # Combined training flow.
    train_op = Concurrently(
        [ppo_train_op, ddpg_train_op], mode="async", output_indexes=[1])
    return StandardMetricsReporting(train_op, workers, config)


# if __name__ == "__main__":
#     args = parser.parse_args()
#     assert not (args.torch and args.mixed_torch_tf),\
#         "Use either --torch or --mixed-torch-tf, not both!"
# ray.init() # # Simple environment with 4 independent cartpole entities # register_env("multi_agent_cartpole", # lambda _: MultiAgentCartPole({"num_agents": 4})) # single_env = gym.make("CartPole-v0") # obs_space = single_env.observation_space # act_space = single_env.action_space # # Note that since the trainer below does not include a default policy or # # policy configs, we have to explicitly set it in the multiagent config: # policies = { # "ppo_policy": (PPOTorchPolicy if args.torch or args.mixed_torch_tf else # PPOTFPolicy, obs_space, act_space, PPO_CONFIG), # "dqn_policy": (DQNTorchPolicy if args.torch else DQNTFPolicy, # obs_space, act_space, DQN_CONFIG), # } # def policy_mapping_fn(agent_id): # if agent_id % 2 == 0: # return "ppo_policy" # else: # return "dqn_policy" # MyTrainer = build_trainer( # name="PPO_DQN_MultiAgent", # default_policy=None, # execution_plan=custom_training_workflow) # config = { # "rollout_fragment_length": 50, # "num_workers": 0, # "env": "multi_agent_cartpole", # "multiagent": { # "policies": policies, # "policy_mapping_fn": policy_mapping_fn, # "policies_to_train": ["dqn_policy", "ppo_policy"], # }, # # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. # "num_gpus": int(os.environ.get("RLLIB_NUM_GPUS", "0")), # "framework": "torch" if args.torch else "tf", # "_use_trajectory_view_api": True, # } # stop = { # "training_iteration": args.stop_iters, # "timesteps_total": args.stop_timesteps, # "episode_reward_mean": args.stop_reward, # } # results = tune.run(MyTrainer, config=config, stop=stop) # if args.as_test: # check_learning_achieved(results, args.stop_reward) # ray.shutdown()