def handle_distributed_coach_orchestrator(graph_manager, args):
    """Deploy a trainer and rollout workers on Kubernetes, then tear them down.

    The container commands are built from this process's own command line
    (``sys.argv[1:]``) with the run type forced to trainer / rollout worker.
    After a successful deployment ``orchestrator.trainer_logs()`` is called
    (presumably blocking while it follows the trainer's output - confirm
    against the orchestrator) and the deployment is always undeployed
    afterwards.

    :param graph_manager: not used by this function; kept for the callers'
        existing signature.
    :param args: parsed command-line arguments. The attributes read here are
        ``experiment_name``, ``memory_backend``, ``data_store``,
        ``s3_end_point``, ``s3_bucket_name``, ``s3_creds_file``, ``image``
        and ``num_workers``.
    :return: None
    """
    ckpt_inside_container = "/checkpoint"
    run_type_flag = '--distributed_coach_run_type'

    # Forward our own CLI arguments to the containers, but drop any run-type
    # flag the user passed (both "--flag value" and "--flag=value" spellings).
    # argparse lets the last occurrence win, so a forwarded copy would override
    # the run type that is injected below.
    forwarded_args = []
    skip_value = False
    for token in sys.argv[1:]:
        if skip_value:
            skip_value = False
        elif token == run_type_flag:
            skip_value = True
        elif not token.startswith(run_type_flag + '='):
            forwarded_args.append(token)

    if '--experiment_name' not in forwarded_args:
        forwarded_args = forwarded_args + ['--experiment_name', args.experiment_name]

    entry_point = ['python3', 'rl_coach/coach.py', run_type_flag]
    rollout_command = entry_point + [str(RunType.ROLLOUT_WORKER)] + forwarded_args
    trainer_command = entry_point + [str(RunType.TRAINER)] + forwarded_args

    memory_backend_params = None
    if args.memory_backend == "redispubsub":
        memory_backend_params = RedisPubSubMemoryBackendParameters()

    ds_params_instance = None
    if args.data_store == "s3":
        ds_params = DataStoreParameters("s3", "", "")
        ds_params_instance = S3DataStoreParameters(
            ds_params=ds_params,
            end_point=args.s3_end_point,
            bucket_name=args.s3_bucket_name,
            creds_file=args.s3_creds_file,
            checkpoint_dir=ckpt_inside_container)
    elif args.data_store == "nfs":
        ds_params = DataStoreParameters("nfs", "kubernetes", "")
        ds_params_instance = NFSDataStoreParameters(ds_params)

    worker_run_type_params = RunTypeParameters(
        args.image, rollout_command,
        run_type=str(RunType.ROLLOUT_WORKER),
        num_replicas=args.num_workers)
    trainer_run_type_params = RunTypeParameters(
        args.image, trainer_command,
        run_type=str(RunType.TRAINER))

    orchestration_params = KubernetesParameters(
        [worker_run_type_params, trainer_run_type_params],
        kubeconfig='~/.kube/config',
        memory_backend_parameters=memory_backend_params,
        data_store_params=ds_params_instance)
    orchestrator = Kubernetes(orchestration_params)

    if not orchestrator.setup():
        print("Could not setup.")
        return

    if orchestrator.deploy_trainer():
        print("Successfully deployed trainer.")
    else:
        print("Could not deploy trainer.")
        return

    if orchestrator.deploy_worker():
        print("Successfully deployed rollout worker(s).")
    else:
        print("Could not deploy rollout worker(s).")
        return

    try:
        orchestrator.trainer_logs()
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop; fall through to the cleanup.
        pass
    finally:
        # Undeploy even when trainer_logs() fails with an unexpected error,
        # so the deployment is not left behind.
        orchestrator.undeploy()
def handle_distributed_coach_orchestrator(args):
    """Deploy a trainer and rollout workers on Kubernetes and report an exit code.

    The container commands are built from this process's own command line
    (``sys.argv[1:]``) with any ``--distributed_coach_run_type`` option
    replaced by the trainer / rollout-worker run type. After a successful
    deployment the worker logs are optionally dumped, then
    ``orchestrator.trainer_logs()`` is called and the deployment is always
    undeployed afterwards.

    :param args: parsed command-line arguments. The attributes read here are
        ``experiment_name``, ``experiment_path``, ``memory_backend``,
        ``data_store``, ``s3_end_point``, ``s3_bucket_name``,
        ``s3_creds_file``, ``image``, ``num_workers`` and
        ``dump_worker_logs``.
    :return: 1 if setup or either deployment fails, or if following the
        trainer logs is interrupted with Ctrl-C; otherwise whatever
        ``orchestrator.trainer_logs()`` returns.
    """
    from rl_coach.orchestrators.kubernetes_orchestrator import KubernetesParameters, \
        Kubernetes, RunTypeParameters

    ckpt_inside_container = "/checkpoint"
    run_type_flag = '--distributed_coach_run_type'

    # Drop every run-type option the user passed, in both the "--flag value"
    # and "--flag=value" spellings, so it cannot override the run type that is
    # injected below. A single filtering pass also cannot fail when the flag
    # happens to be the last token.
    arg_list = []
    skip_value = False
    for token in sys.argv[1:]:
        if skip_value:
            skip_value = False
        elif token == run_type_flag:
            skip_value = True
        elif not token.startswith(run_type_flag + '='):
            arg_list.append(token)

    if '--experiment_name' not in arg_list:
        arg_list = arg_list + ['--experiment_name', args.experiment_name]

    entry_point = ['python3', 'rl_coach/coach.py', run_type_flag]
    trainer_command = entry_point + [str(RunType.TRAINER)] + arg_list
    rollout_command = entry_point + [str(RunType.ROLLOUT_WORKER)] + arg_list

    memory_backend_params = None
    if args.memory_backend == "redispubsub":
        memory_backend_params = RedisPubSubMemoryBackendParameters()

    ds_params_instance = None
    if args.data_store == "s3":
        ds_params = DataStoreParameters("s3", "", "")
        ds_params_instance = S3DataStoreParameters(
            ds_params=ds_params,
            end_point=args.s3_end_point,
            bucket_name=args.s3_bucket_name,
            creds_file=args.s3_creds_file,
            checkpoint_dir=ckpt_inside_container,
            expt_dir=args.experiment_path)
    elif args.data_store == "nfs":
        ds_params = DataStoreParameters("nfs", "kubernetes", "")
        ds_params_instance = NFSDataStoreParameters(ds_params)

    worker_run_type_params = RunTypeParameters(
        args.image, rollout_command,
        run_type=str(RunType.ROLLOUT_WORKER),
        num_replicas=args.num_workers)
    trainer_run_type_params = RunTypeParameters(
        args.image, trainer_command,
        run_type=str(RunType.TRAINER))

    orchestration_params = KubernetesParameters(
        [worker_run_type_params, trainer_run_type_params],
        kubeconfig='~/.kube/config',
        memory_backend_parameters=memory_backend_params,
        data_store_params=ds_params_instance)
    orchestrator = Kubernetes(orchestration_params)

    if not orchestrator.setup():
        print("Could not setup.")
        return 1

    if orchestrator.deploy_trainer():
        print("Successfully deployed trainer.")
    else:
        print("Could not deploy trainer.")
        return 1

    if orchestrator.deploy_worker():
        print("Successfully deployed rollout worker(s).")
    else:
        print("Could not deploy rollout worker(s).")
        return 1

    # Stays 1 unless trainer_logs() completes and supplies its own result.
    exit_code = 1
    try:
        if args.dump_worker_logs:
            screen.log_title("Dumping rollout worker logs in: {}".format(
                args.experiment_path))
            orchestrator.worker_logs(path=args.experiment_path)

        try:
            exit_code = orchestrator.trainer_logs()
        except KeyboardInterrupt:
            # Ctrl-C is the expected way to stop; fall through to the cleanup.
            pass
    finally:
        # Undeploy even when log collection fails with an unexpected error,
        # so the deployment is not left behind.
        orchestrator.undeploy()

    return exit_code