def _build_parallel_ctrl(self, n_worker):
    """Create the shared control structures used by the parallel sampler.

    Builds lock-free shared flags/counters, entry/exit barriers sized for
    all workers plus the master, and queues for returning trajectory infos.
    """
    n_parties = n_worker + 1  # Every worker plus the master process.
    self.ctrl = AttrDict(
        quit=mp.RawValue(ctypes.c_bool, False),
        barrier_in=mp.Barrier(n_parties),
        barrier_out=mp.Barrier(n_parties),
        do_eval=mp.RawValue(ctypes.c_bool, False),
        itr=mp.RawValue(ctypes.c_long, 0),
    )
    # Queues carry per-trajectory info dicts back to the master.
    self.traj_infos_queue = mp.Queue()
    self.eval_traj_infos_queue = mp.Queue()
    self.sync = AttrDict(stop_eval=mp.RawValue(ctypes.c_bool, False))
def initialize(self, affinity):
    """Initialization inside the main sampler process.  Sets process hardware
    affinities, creates specified number of environment instances and
    instantiates the collector with them.  If applicable, does the same for
    evaluation environment instances.  Moves the agent to device (could be
    GPU), and calls on ``agent.async_cpu()`` initialization.  Starts up
    collector.

    :param affinity: dict of hardware assignments; reads ``set_affinity``,
        ``master_cpus``, and ``cuda_idx`` keys here.
    """
    p = psutil.Process()
    if affinity.get("set_affinity", True):
        # Pin this (master) process to its assigned cores.
        p.cpu_affinity(affinity["master_cpus"])
    # torch.set_num_threads(affinity["master_torch_threads"])
    torch.set_num_threads(1)  # Needed to prevent MKL hang :( .
    B = self.batch_spec.B
    envs = [self.EnvCls(**self.env_kwargs) for _ in range(B)]
    # Serial setting: no real shared memory needed, so a plain AttrDict
    # stands in for the parallel samplers' mp.RawValue-based sync object.
    sync = AttrDict(
        db_idx=AttrDict(value=0))  # Mimic the mp.RawValue format.
    collector = self.CollectorCls(
        rank=0,
        envs=envs,
        samples_np=self.double_buffer,
        batch_T=self.batch_spec.T,
        TrajInfoCls=self.TrajInfoCls,
        agent=self.agent,
        sync=sync,
    )
    if self.eval_n_envs > 0:
        eval_envs = [
            self.EnvCls(**self.eval_env_kwargs)
            for _ in range(self.eval_n_envs)
        ]
        eval_CollectorCls = self.eval_CollectorCls or SerialEvalCollector
        self.eval_collector = eval_CollectorCls(
            envs=eval_envs,
            agent=self.agent,
            TrajInfoCls=self.TrajInfoCls,
            # Split the step budget evenly across eval envs.
            max_T=self.eval_max_steps // self.eval_n_envs,
            max_trajectories=self.eval_max_trajectories,
        )
    self.agent.to_device(cuda_idx=affinity.get("cuda_idx", None))
    self.agent.async_cpu(share_memory=False)
    # Decorrelate env states before regular sampling begins.
    agent_inputs, traj_infos = collector.start_envs(
        self.max_decorrelation_steps)
    collector.start_agent()
    self.collector = collector
    self.agent_inputs = agent_inputs
    self.traj_infos = traj_infos
    self.sync = sync
    logger.log("Serial sampler initialized.")
def launch_memcpy(self, sample_buffers, replay_buffer):
    """
    Fork a Python process for each of the sampler double buffers.  (It may
    be overkill to use two separate processes here, may be able to simplify
    to one and still get good performance.)
    """
    processes = []
    for buf_idx, buf in enumerate(sample_buffers):  # (2 for double-buffer.)
        # Per-buffer view onto the shared control signals.
        copier_ctrl = AttrDict(
            quit=self.ctrl.quit,
            sample_ready=self.ctrl.sample_ready[buf_idx],
            sample_copied=self.ctrl.sample_copied[buf_idx],
        )
        proc = mp.Process(
            target=memory_copier,
            args=(buf, self.algo.samples_to_buffer, replay_buffer,
                  copier_ctrl),
        )
        processes.append(proc)
    for proc in processes:
        proc.start()
    self.memcpy_procs = processes
def _build_parallel_ctrl(self, n_worker):
    """Create shared control and sync objects, including curriculum state.

    Extends the base parallel-ctrl layout with locked shared values for
    curriculum progress and a shared seed array (one slot per worker).
    """
    parties = n_worker + 1  # All workers plus the master.
    self.ctrl = AttrDict(
        quit=mp.RawValue(ctypes.c_bool, False),
        barrier_in=mp.Barrier(parties),
        barrier_out=mp.Barrier(parties),
        do_eval=mp.RawValue(ctypes.c_bool, False),
        itr=mp.RawValue(ctypes.c_long, 0),
        # TODO SAVE state of curriculum?
    )
    self.traj_infos_queue = mp.Queue()
    self.eval_traj_infos_queue = mp.Queue()
    # Locked mp.Value objects for curriculum state, plus the lock-free
    # stop_eval flag shared with workers.
    self.sync = AttrDict(
        stop_eval=mp.RawValue(ctypes.c_bool, False),
        glob_average_return=mp.Value('d', 0.0),
        curriculum_stage=mp.Value('i', 0),
        difficulty=mp.Value('d', 0.0),
        seeds=mp.Array('i', n_worker),
    )
def assemble_workers_kwargs(affinity, seed, samples_np, n_envs_list,
                            step_buffer_np, sync, eval_n_envs,
                            eval_step_buffer_np):
    """Build the per-worker keyword-argument dicts for sampler workers.

    Each worker gets its own rank/seed/cpus, a batch-dim slice of the
    shared sample and step buffers, and its own pair of semaphores from
    the shared sync object.
    """
    workers_kwargs = []
    env_offset = 0  # Running start index into the batch (env) dimension.
    for rank, worker_cpus in enumerate(affinity["workers_cpus"]):
        n_envs = n_envs_list[rank]
        env_slice = slice(env_offset, env_offset + n_envs)
        env_offset += n_envs
        per_worker_sync = AttrDict(
            step_blocker=sync.step_blockers[rank],
            act_waiter=sync.act_waiters[rank],
            stop_eval=sync.stop_eval,
        )
        kwargs = dict(
            rank=rank,
            seed=seed + rank,  # Distinct seed per worker.
            cpus=worker_cpus,
            n_envs=n_envs,
            samples_np=samples_np[:, env_slice],
            step_buffer_np=step_buffer_np[env_slice],
            sync=per_worker_sync,
        )
        if eval_n_envs > 0:
            # Eval buffer is laid out contiguously by rank.
            eval_slice = slice(rank * eval_n_envs, (rank + 1) * eval_n_envs)
            kwargs["eval_step_buffer_np"] = eval_step_buffer_np[eval_slice]
        workers_kwargs.append(kwargs)
    return workers_kwargs
def sampling_process(common_kwargs, worker_kwargs):
    """Arguments fed from the Sampler class in master process.

    Worker target: builds envs and a collector, optionally an evaluation
    collector, then loops on the master's in/out barriers, collecting a
    training batch or running evaluation each iteration until signaled to
    quit.
    """
    c, w = AttrDict(**common_kwargs), AttrDict(**worker_kwargs)
    initialize_worker(w.rank, w.seed, w.cpus, c.torch_threads,
                      w.get("group", None))
    envs = [c.EnvCls(**c.env_kwargs) for _ in range(w.n_envs)]
    collector = c.CollectorCls(
        rank=w.rank,
        envs=envs,
        samples_np=w.samples_np,
        batch_T=c.batch_T,
        TrajInfoCls=c.TrajInfoCls,
        agent=c.get("agent", None),  # Optional depending on parallel setup.
        sync=w.get("sync", None),
        step_buffer_np=w.get("step_buffer_np", None),
    )
    agent_inputs, traj_infos = collector.start_envs(c.max_decorrelation_steps)
    collector.start_agent()
    # Empty list when c.eval_n_envs == 0, which disables evaluation below.
    eval_envs = [c.EnvCls(**c.eval_env_kwargs) for _ in range(c.eval_n_envs)]
    if eval_envs:  # May do evaluation.
        eval_collector = c.eval_CollectorCls(
            rank=w.rank,
            envs=eval_envs,
            TrajInfoCls=c.TrajInfoCls,
            traj_infos_queue=c.traj_infos_queue,
            max_T=c.eval_max_T,
            agent=c.get("agent", None),
            sync=w.get("sync", None),
            step_buffer_np=w.get("eval_step_buffer_np", None),
        )
    ctrl = c.ctrl
    # Signal the master that startup is complete.
    ctrl.barrier_out.wait()
    while True:
        # Reset any done envs before syncing at the entry barrier.
        agent_inputs = collector.reset_if_needed(
            agent_inputs)  # Outside barrier?
        ctrl.barrier_in.wait()
        if ctrl.quit.value:
            break
        if ctrl.do_eval.value:
            eval_collector.collect_evaluation(
                ctrl.itr.value)  # Traj_infos to queue inside.
        else:
            agent_inputs, traj_infos, completed_infos = collector.collect_batch(
                agent_inputs, traj_infos, ctrl.itr.value)
            for info in completed_infos:
                c.traj_infos_queue.put(info)
        ctrl.barrier_out.wait()
    # Clean up all environments on exit.
    for env in envs + eval_envs:
        env.close()
def build_par_objs(self, world_size):
    """Return shared parallel-communication objects for world_size runners."""
    return AttrDict(
        barrier=mp.Barrier(world_size),
        traj_infos_queue=mp.Queue(),
    )
def build_affinities_gpu_1cpu_drive(slt, gpu, cpu, cxg=1, gpr=1, cpw=1,
                                    hto=None, skt=1):
    """OLD.
    Divides CPUs evenly among GPUs, with one CPU held open for each GPU, to
    drive it.  Workers assigned on the remaining CPUs.  Master permitted to
    use driver core + worker cores (good in case of multi-context per GPU and
    old alternating action server sampler, from accel_rl).  GPU-driving CPUs
    grouped at the lowest numbered cores of each CPU socket.

    :param slt: run-slot index for this launch.
    :param gpu: total number of GPUs.
    :param cpu: total number of physical CPU cores.
    :param cxg: contexts per GPU.
    :param gpr: GPUs per run (only 1 supported here).
    :param cpw: cores per worker.
    :param hto: hyperthread offset; None defaults to ``cpu``, 0 disables.
    :param skt: number of CPU sockets.
    """
    if gpr > 1:
        raise NotImplementedError  # (parallel training)
    n_ctx = gpu * cxg
    n_run_slots = n_ctx // gpr
    assert slt < n_run_slots
    cpu_per_gpu = cpu // gpu
    sim_cpu_per_gpu = cpu_per_gpu - 1  # One core reserved to drive the GPU.
    n_sim_cpu = cpu - gpu
    sim_cpu_per_ctx = n_sim_cpu // n_ctx
    assert gpu >= skt
    assert gpu % skt == 0
    gpu_per_skt = gpu // skt
    assert cpu % skt == 0
    cpu_per_skt = cpu // skt
    my_ctx = slt  # Different for multi-context run, not implemented.
    my_gpu = my_ctx // cxg
    my_skt = my_gpu // gpu_per_skt
    gpu_in_skt = my_gpu % gpu_per_skt
    # Driver cores occupy the lowest-numbered cores of each socket.
    gpu_core = gpu_in_skt + my_skt * cpu_per_skt
    ctx_in_gpu = my_ctx % cxg
    # First simulation core for this context: skip this socket's driver
    # cores, then offset by earlier GPUs' and contexts' sim cores.
    min_sim_core = (my_skt * cpu_per_skt + gpu_per_skt +
                    gpu_in_skt * sim_cpu_per_gpu + ctx_in_gpu * sim_cpu_per_ctx)
    sim_cores = tuple(range(min_sim_core, min_sim_core + sim_cpu_per_ctx))
    assert len(sim_cores) % cpw == 0
    if hto is None:
        hto = cpu
    if hto > 0:
        # Pair each physical core with its hyperthread sibling (core + hto).
        hyperthreads = tuple(c + hto for c in sim_cores)
        workers_cpus = tuple(sim_cores[i:i + cpw] + hyperthreads[i:i + cpw]
                             for i in range(0, len(sim_cores), cpw))
        master_cpus = (gpu_core,) + sim_cores + (gpu_core + hto,) + hyperthreads
    else:
        workers_cpus = tuple(sim_cores[i:i + cpw]
                             for i in range(0, len(sim_cores), cpw))
        master_cpus = (gpu_core,) + sim_cores
    affinity = AttrDict(
        all_cpus=master_cpus,
        master_cpus=master_cpus,
        workers_cpus=workers_cpus,
        master_torch_threads=1,
        worker_torch_threads=cpw,
        cuda_idx=my_gpu,
    )
    return affinity
def initialize(self, affinity):
    """Initialize the sampler inside the main process: pin CPU affinity,
    build environments and the collector (plus eval collector if
    configured), move the agent to device, and start up the collector.
    """
    p = psutil.Process()
    if affinity.get("set_affinity", True):
        # Pin this (master) process to its assigned cores.
        p.cpu_affinity(affinity["master_cpus"])
    # torch.set_num_threads(affinity["master_torch_threads"])
    torch.set_num_threads(1)  # Needed to prevent MKL hang :( .
    B = self.batch_spec.B
    envs = [self.EnvCls(**self.env_kwargs) for _ in range(B)]
    # Plain AttrDict stands in for the parallel samplers' shared-memory
    # sync object, since everything runs in this one process.
    sync = AttrDict(
        db_idx=AttrDict(value=0))  # Mimic the mp.RawValue format.
    collector = self.CollectorCls(
        rank=0,
        envs=envs,
        samples_np=self.double_buffer,
        batch_T=self.batch_spec.T,
        TrajInfoCls=self.TrajInfoCls,
        agent=self.agent,
        sync=sync,
    )
    if self.eval_n_envs > 0:
        eval_envs = [
            self.EnvCls(**self.eval_env_kwargs)
            for _ in range(self.eval_n_envs)
        ]
        eval_CollectorCls = self.eval_CollectorCls or SerialEvalCollector
        self.eval_collector = eval_CollectorCls(
            envs=eval_envs,
            agent=self.agent,
            TrajInfoCls=self.TrajInfoCls,
            # Split the step budget evenly across eval envs.
            max_T=self.eval_max_steps // self.eval_n_envs,
            max_trajectories=self.eval_max_trajectories,
        )
    self.agent.to_device(cuda_idx=affinity.get("cuda_idx", None))
    self.agent.async_cpu(share_memory=False)
    # Decorrelate env states before regular sampling begins.
    agent_inputs, traj_infos = collector.start_envs(
        self.max_decorrelation_steps)
    collector.start_agent()
    self.collector = collector
    self.agent_inputs = agent_inputs
    self.traj_infos = traj_infos
    self.sync = sync
    logger.log("Serial sampler initialized.")
def build_par_objs(self, n_runners):
    """Create shared coordination objects for n_runners processes.

    Includes a Manager-backed dict as a general-purpose communication
    channel in addition to the barrier and trajectory-info queue.
    """
    manager = mp.Manager()
    shared_dict = manager.dict()  # For any other comms.
    return AttrDict(
        barrier=mp.Barrier(n_runners),
        traj_infos_queue=mp.Queue(),
        dict=shared_dict,
    )
def launch_workers(self, double_buffer, traj_infos_queue, affinity, seed,
                   n_envs_list, eval_n_envs_per):
    """Build shared sync objects and step buffers, assemble per-worker
    kwargs, and fork one sampling process per worker CPU group.
    """
    n_worker = len(affinity["workers_cpus"])
    # One blocker/waiter semaphore pair per worker for stepping handshake.
    sync = AttrDict(
        step_blockers=[mp.Semaphore(0) for _ in range(n_worker)],
        act_waiters=[mp.Semaphore(0) for _ in range(n_worker)],
        stop_eval=mp.RawValue(ctypes.c_bool, False),
    )
    step_buffer_pyt, step_buffer_np = build_step_buffer(
        self.examples, sum(n_envs_list))
    if self.eval_n_envs_per > 0:
        eval_n_envs = self.eval_n_envs_per * n_worker
        eval_step_buffer_pyt, eval_step_buffer_np = build_step_buffer(
            self.examples, eval_n_envs)
        self.eval_step_buffer_pyt = eval_step_buffer_pyt
        self.eval_step_buffer_np = eval_step_buffer_np
    else:
        eval_step_buffer_np = None
    # Arguments identical across workers; per-worker args come from
    # assemble_workers_kwargs below.
    common_kwargs = dict(
        EnvCls=self.EnvCls,
        env_kwargs=self.env_kwargs,
        agent=None,  # Workers do not hold the agent in this setup.
        batch_T=self.batch_spec.T,
        CollectorCls=self.CollectorCls,
        TrajInfoCls=self.TrajInfoCls,
        traj_infos_queue=traj_infos_queue,
        ctrl=self.ctrl,
        max_decorrelation_steps=self.max_decorrelation_steps,
        eval_n_envs=self.eval_n_envs_per,
        eval_CollectorCls=self.eval_CollectorCls or EvalCollector,
        eval_env_kwargs=self.eval_env_kwargs,
        eval_max_T=self.eval_max_T,
    )
    workers_kwargs = assemble_workers_kwargs(affinity, seed, double_buffer,
                                             n_envs_list, step_buffer_np,
                                             sync, self.eval_n_envs_per,
                                             eval_step_buffer_np)
    workers = [
        mp.Process(target=sampling_process,
                   kwargs=dict(common_kwargs=common_kwargs,
                               worker_kwargs=w_kwargs))
        for w_kwargs in workers_kwargs
    ]
    for w in workers:
        w.start()
    self.workers = workers
    self.step_buffer_pyt = step_buffer_pyt
    self.step_buffer_np = step_buffer_np
    self.sync = sync
    self.mid_batch_reset = self.CollectorCls.mid_batch_reset
def build_ctrl(self, n_optim_runner):
    """Build shared control objects for the async optimizer/sampler loop."""
    # A throttling barrier is only needed with multiple optimizer runners.
    if n_optim_runner > 1:
        opt_throttle = mp.Barrier(n_optim_runner)
    else:
        opt_throttle = None
    return AttrDict(
        quit=mp.Value('b', lock=True),
        sample_ready=[mp.Semaphore(0) for _ in range(2)],  # Double buffer.
        sample_copied=[mp.Semaphore(1) for _ in range(2)],
        sample_itr=mp.Value('l', lock=True),
        opt_throttle=opt_throttle,
        eval_time=mp.Value('d', lock=True),
    )
def build_ctrl(self, world_size):
    """Build shared control objects for world_size optimizer processes."""
    # Only synchronize optimizers when there is more than one of them.
    if world_size > 1:
        opt_throttle = mp.Barrier(world_size)
    else:
        opt_throttle = None
    return AttrDict(
        quit=mp.Value('b', lock=True),
        quit_opt=mp.RawValue('b'),
        sample_ready=[mp.Semaphore(0) for _ in range(2)],  # Double buffer.
        sample_copied=[mp.Semaphore(1) for _ in range(2)],
        sampler_itr=mp.Value('l', lock=True),
        opt_throttle=opt_throttle,
        eval_time=mp.Value('d', lock=True),
    )
def launch_memcpy(self, sample_buffers, replay_buffer):
    """Start one memory_copier process per sampler double buffer."""
    procs = []
    for idx, buf in enumerate(sample_buffers):
        # Per-buffer view of the shared control signals.
        copier_ctrl = AttrDict(
            quit=self.ctrl.quit,
            sample_ready=self.ctrl.sample_ready[idx],
            sample_copied=self.ctrl.sample_copied[idx],
        )
        procs.append(mp.Process(target=memory_copier,
                                args=(buf, replay_buffer, copier_ctrl)))
    for proc in procs:
        proc.start()
    self.memcpy_procs = procs
def build_par_objs(n, groups=1):
    """Build ctrl, trajectory queue, and sync objects for grouped workers.

    :param n: workers per group.
    :param groups: number of groups; with one group the semaphore lists are
        flattened to plain lists rather than lists-of-lists.
    """
    total_workers = n * groups
    ctrl = AttrDict(
        quit=mp.RawValue(ctypes.c_bool, False),
        barrier_in=mp.Barrier(total_workers + 1),  # +1 for the master.
        barrier_out=mp.Barrier(total_workers + 1),
        do_eval=mp.RawValue(ctypes.c_bool, False),
        itr=mp.RawValue(ctypes.c_long, 0),
    )
    traj_infos_queue = mp.Queue()
    step_blockers = [[mp.Semaphore(0) for _ in range(n)]
                     for _ in range(groups)]
    act_waiters = [[mp.Semaphore(0) for _ in range(n)]
                   for _ in range(groups)]
    if groups == 1:
        # Single group: drop the outer list level.
        step_blockers = step_blockers[0]
        act_waiters = act_waiters[0]
    sync = AttrDict(
        step_blockers=step_blockers,
        act_waiters=act_waiters,
        stop_eval=mp.RawValue(ctypes.c_bool, False),
    )
    return ctrl, traj_infos_queue, sync
def __init__(self, task, seed=0, headless=True, num_envs=0,
             episode_length=1000, randomize=False):
    """Wrap an Isaac Gym task as an environment with rlpyt-style spaces.

    :param task: task name; must be one of VALID_TASKS.
    :param seed: random seed passed through to the task config.
    :param headless: run without a viewer window.
    :param num_envs: number of parallel simulated environments (0 presumably
        means "use the config default" — TODO confirm against parse_task).
    :param episode_length: max steps per episode.
    :param randomize: enable domain randomization in the task config.
    """
    assert task in VALID_TASKS
    base_args = AttrDict(BASE_ARGS)  # Base args except for cfg
    # NOTE(review): this first retrieve_cfg call runs before the
    # task-dependent fields below are set, and its results are overwritten
    # by the second call — looks redundant; confirm retrieve_cfg has no
    # needed side effects before removing.
    base_args.logdir, base_args.cfg_train, base_args.cfg_env = retrieve_cfg(
        base_args, False)
    base_args.headless = headless
    base_args.task = task
    base_args.seed = seed
    base_args.episode_length = episode_length
    base_args.num_envs = num_envs
    base_args.randomize = randomize
    base_args.logdir, base_args.cfg_train, base_args.cfg_env = retrieve_cfg(
        base_args, False)  # Update configs properly
    cfg, cfg_train, logdir = load_cfg(base_args)
    sim_params = parse_sim_params(base_args, cfg, cfg_train)
    self.task, self.env = parse_task(base_args, cfg, cfg_train,
                                     sim_params)  # Create environment
    self.num_envs = self.env.num_envs  # Number of environments
    self.device = self.env.rl_device  # cuda or cpu
    # Wrap the raw Gym spaces so consumers get batched, float32 views.
    self._observation_space = IsaacSpaceWrapper(
        num_envs=self.num_envs,
        device=self.env.rl_device,
        space=self.env.observation_space,
        name="obs",
        force_float32=True,
    )
    self._action_space = IsaacSpaceWrapper(
        num_envs=self.num_envs,
        device=self.env.rl_device,
        space=self.env.action_space,
        name="act",
        force_float32=True,
    )
def launch_memcpy(self, sample_buffers, replay_buffer):
    """Fork one memory-copier process per sampler double buffer."""
    procs = []
    for i, buf in enumerate(sample_buffers):  # (2 for double-buffer.)
        # Each copier watches its own ready/copied semaphore pair.
        copier_ctrl = AttrDict(
            quit=self.ctrl.quit,
            sample_ready=self.ctrl.sample_ready[i],
            sample_copied=self.ctrl.sample_copied[i],
        )
        procs.append(mp.Process(
            target=memory_copier,
            args=(buf, self.algo.samples_to_buffer, replay_buffer,
                  copier_ctrl),
        ))
    for proc in procs:
        proc.start()
    self.memcpy_procs = procs
def _build_parallel_ctrl(self, n_worker):
    """
    Create the data structures used to control the parallel training process.

    multiprocessing.RawValue: a value shared across processes without a lock.
    multiprocessing.Barrier: a simple synchronization primitive for a fixed
    number of processes to wait for each other; once every party has called
    wait(), all of them proceed at the same time.
    multiprocessing.Queue: a message queue for passing data between
    processes.

    :param n_worker: the actual number of workers (not necessarily equal to
        the value originally configured by the user).
    """
    self.ctrl = AttrDict(
        quit=mp.RawValue(ctypes.c_bool, False),
        barrier_in=mp.Barrier(
            n_worker + 1),  # Requires n_worker + 1 calls to wait() before all
        # the processes started via multiprocessing are "released".
        barrier_out=mp.Barrier(n_worker + 1),
        do_eval=mp.RawValue(ctypes.c_bool, False),
        itr=mp.RawValue(ctypes.c_long, 0),
    )
    self.traj_infos_queue = mp.Queue()  # Queue shared across processes.
    self.eval_traj_infos_queue = mp.Queue()
    # RawValue(typecode_or_type, *args) returns a ctypes object allocated
    # from shared memory; here a bool-typed object.
    self.sync = AttrDict(stop_eval=mp.RawValue(ctypes.c_bool, False))
def launch_workers(self, double_buffer_slice, affinity, seed, n_envs_list): self.n_worker = n_worker = len(n_envs_list) # A little slight-of-hand to make 2-level signal: self.ctrl.stop_eval = self.sync.stop_eval self.sync = AttrDict( obs_ready=[mp.Semaphore(0) for _ in range(n_worker)], act_ready=[mp.Semaphore(0) for _ in range(n_worker)], stop_eval=mp.RawValue(ctypes.c_bool, False), # Overwrite. # stop_eval=self.ctrl.stop_eval, # No, make 2-level signal. db_idx=self.ctrl. db_idx, # Copy into sync which passes to Collector. ) self.step_buffer_pyt, self.step_buffer_np = build_step_buffer( self.examples, sum(n_envs_list)) self.agent_inputs = AgentInputs( self.step_buffer_pyt.observation, self.step_buffer_pyt.action, self.step_buffer_pyt.reward, ) if self.eval_n_envs > 0: eval_n_envs = self.eval_n_envs_per * n_worker eval_step_buffer_pyt, eval_step_buffer_np = build_step_buffer( self.examples, eval_n_envs) self.eval_step_buffer_pyt = eval_step_buffer_pyt self.eval_step_buffer_np = eval_step_buffer_np self.eval_agent_inputs = AgentInputs( self.eval_step_buffer_pyt.observation, self.eval_step_buffer_pyt.action, self.eval_step_buffer_pyt.reward, ) # eval_max_T already made in earlier initialize. self.double_buffer = double_buffer_slice # Now only see my part. common_kwargs = self._assemble_common_kwargs(affinity) common_kwargs["agent"] = None # Remove. workers_kwargs = self._assemble_workers_kwargs(affinity, seed, n_envs_list) # Yes, fork again. self.workers = [ mp.Process( target=sampling_process, kwargs=dict(common_kwargs=common_kwargs, worker_kwargs=w_kwargs), ) for w_kwargs in workers_kwargs ] for w in self.workers: w.start()
def build_ctrl(self, world_size):
    """
    Builds several parallel communication mechanisms for controlling the
    workflow across processes.
    """
    opt_throttle = None
    if world_size > 1:
        # Barrier keeps multiple optimizer processes in lock-step.
        opt_throttle = mp.Barrier(world_size)
    return AttrDict(
        quit=mp.Value('b', lock=True),
        quit_opt=mp.RawValue('b'),
        sample_ready=[mp.Semaphore(0) for _ in range(2)],  # Double buffer.
        sample_copied=[mp.Semaphore(1) for _ in range(2)],
        sampler_itr=mp.Value('l', lock=True),
        opt_throttle=opt_throttle,
        eval_time=mp.Value('d', lock=True),
    )
def build_cpu_affinity(slt, cpu, cpr, cpw=1, hto=None, res=0, skt=1, gpu=0,
                       alt=0, saf=1):
    """Build the hardware affinity dict for a CPU-only run slot.

    :param slt: run-slot index (0-based) for this launch.
    :param cpu: total number of physical CPU cores.
    :param cpr: cores per run slot.
    :param cpw: cores per worker.
    :param hto: hyperthread offset; None defaults to ``cpu``, 0 disables.
    :param res: cores reserved for the master (excluded from workers).
    :param skt: number of CPU sockets.
    :param gpu: must be 0 (CPU-only builder).
    :param alt: alternating-sampler flag, passed through.
    :param saf: whether to actually set process affinities.
    :returns: AttrDict with master/worker CPU assignments.
    """
    assert gpu == 0
    assert cpu % cpr == 0
    hto = cpu if hto is None else hto  # Default is None, 0 is OFF.
    assert (hto - cpu) % skt == 0
    n_run_slots = cpu // cpr
    # Fixed off-by-one: the slot index must be strictly less than the number
    # of slots; `slt == n_run_slots` would compute cores past the last CPU
    # (the analogous GPU builders assert strict `<`).
    assert slt < n_run_slots
    cpu_per_skt = max(cpu, hto) // skt
    if n_run_slots >= skt:
        # One or more whole run slots fit per socket.
        slt_per_skt = n_run_slots // skt
        my_skt = slt // slt_per_skt
        slt_in_skt = slt % slt_per_skt
        min_core = my_skt * cpu_per_skt + slt_in_skt * cpr
        cores = tuple(range(min_core, min_core + cpr))
    else:  # One run multiple sockets.
        skt_per_slt = skt // n_run_slots
        cores = list()
        low_skt = slt * skt_per_slt
        for s in range(skt_per_slt):
            min_core = (low_skt + s) * cpu_per_skt
            high_core = min_core + cpr // skt_per_slt
            cores.extend(list(range(min_core, high_core)))
        cores = tuple(cores)
    # Reserve the first `res` cores for the master only.
    worker_cores = cores[res:]
    assert len(worker_cores) % cpw == 0
    master_cpus = get_master_cpus(cores, hto)
    workers_cpus = get_workers_cpus(worker_cores, cpw, hto, alt)
    affinity = AttrDict(
        all_cpus=master_cpus,
        master_cpus=master_cpus,
        workers_cpus=workers_cpus,
        master_torch_threads=len(cores),
        worker_torch_threads=cpw,
        alternating=bool(alt),  # Just to pass through a check.
        set_affinity=bool(saf),
    )
    return affinity
def sample_runner_initialize(self, affinity):
    """Compute env allocations, build shared ctrl objects, and launch one
    action-server process per affinity entry.

    :param affinity: list of per-server affinity dicts; each must provide
        a ``workers_cpus`` entry.
    """
    n_server = len(affinity)
    n_worker = sum(len(aff["workers_cpus"]) for aff in affinity)
    # Spread batch_B envs as evenly as possible across workers.
    n_envs_list = [self.batch_spec.B // n_worker] * n_worker
    if not self.batch_spec.B % n_worker == 0:
        logger.log(
            "WARNING: unequal number of envs per process, from "
            f"batch_B {self.batch_spec.B} and n_parallel {n_worker} "
            "(possible suboptimal speed).")
        for b in range(self.batch_spec.B % n_worker):
            n_envs_list[b] += 1  # Give the remainder to the first workers.
    if self.eval_n_envs > 0:
        eval_n_envs_per = max(1, self.eval_n_envs // len(n_envs_list))
        eval_n_envs = eval_n_envs_per * n_worker
        logger.log(f"Total parallel evaluation envs: {eval_n_envs}.")
        self.eval_max_T = 1 + int(self.eval_max_steps // eval_n_envs)
        self.eval_n_envs_per = eval_n_envs_per
    else:
        self.eval_n_envs_per = 0
        self.eval_max_T = 0
    ctrl = AttrDict(
        quit=mp.RawValue(ctypes.c_bool, False),
        # Barriers admit servers + workers + the master.
        barrier_in=mp.Barrier(n_server + n_worker + 1),
        barrier_out=mp.Barrier(n_server + n_worker + 1),
        do_eval=mp.RawValue(ctypes.c_bool, False),
        itr=mp.RawValue(ctypes.c_long, 0),
    )
    traj_infos_queue = mp.Queue()
    common_kwargs = dict(
        ctrl=ctrl,
        traj_infos_queue=traj_infos_queue,
    )
    servers_kwargs = assemble_servers_kwargs(affinity, n_envs_list,
                                             self.seed, self.double_buffer)
    # BUG FIX: the original passed `kwargs=s_kwargs.update(**common_kwargs)`,
    # but dict.update() returns None, so every server was launched with
    # kwargs=None and lost its arguments.  Merge the dicts instead.
    servers = [
        mp.Process(target=self.action_server_process,
                   kwargs=dict(s_kwargs, **common_kwargs))
        for s_kwargs in servers_kwargs
    ]
    for s in servers:
        s.start()
    self.servers = servers
    self.ctrl = ctrl
    self.traj_infos_queue = traj_infos_queue
def reset_dones(self, done):
    """Extract stats for episodes finished at `done` indices, then reset
    those accumulators for the next episodes.

    :param done: boolean index into the per-env stat arrays.
    :returns: list of AttrDict, one per completed episode.
    """
    lengths = self.Length[done]
    returns = self.Return[done]
    nonzero_rewards = self.NonzeroRewards[done]
    disc_returns = self.DiscountedReturn[done]
    cur_discounts = self._cur_discount[done]
    completed_infos = [
        AttrDict(Length=ep_len, Return=ep_ret, NonzeroRewards=ep_nzr,
                 DiscountedReturn=ep_dret, _cur_discount=ep_cd,
                 _discount=self._discount)
        for ep_len, ep_ret, ep_nzr, ep_dret, ep_cd in zip(
            lengths, returns, nonzero_rewards, disc_returns, cur_discounts)
    ]
    # Reset accumulators at the finished positions.
    self.Length[done] = 0
    self.Return[done] = 0.
    self.NonzeroRewards[done] = 0.
    self.DiscountedReturn[done] = 0.
    self._cur_discount[done] = 1.
    return completed_infos
def _assemble_workers_kwargs(self, affinity, seed, n_envs_list):
    """Extend the base per-worker kwargs with sync objects and batch-dim
    slices of the shared step buffers.
    """
    workers_kwargs = super()._assemble_workers_kwargs(
        affinity, seed, n_envs_list)
    env_offset = 0  # Running start of this worker's envs in the batch dim.
    for rank, w_kwargs in enumerate(workers_kwargs):
        n_envs = n_envs_list[rank]
        w_kwargs["sync"] = AttrDict(
            stop_eval=self.sync.stop_eval,
            obs_ready=self.sync.obs_ready[rank],
            act_ready=self.sync.act_ready[rank],
        )
        w_kwargs["step_buffer_np"] = self.step_buffer_np[
            env_offset:env_offset + n_envs]
        if self.eval_n_envs > 0:
            # Eval buffer is laid out contiguously by rank.
            lo = self.eval_n_envs_per * rank
            hi = self.eval_n_envs_per * (rank + 1)
            w_kwargs["eval_step_buffer_np"] = self.eval_step_buffer_np[lo:hi]
        env_offset += n_envs
    return workers_kwargs
def build_and_train():
    """Configure affinities, sampler, SAC algorithm, agent, and async runner
    for the 'picking' experiment, then train under a logger context.
    """
    # One optimizer process on CPU 0; sampler master on CPU 0 with one
    # worker on CPU 1.
    opt_affinities = list()
    opt_affinity = dict(cpus=[0], cuda_idx=None, torch_threads=1,
                        set_affinity=True)
    opt_affinities.append(opt_affinity)
    smp_affinity = AttrDict(
        all_cpus=[0, 1],
        master_cpus=[0],
        workers_cpus=[1],
        master_torch_threads=1,
        worker_torch_threads=1,
        cuda_idx=None,
        alternating=False,  # Just to pass through a check.
        set_affinity=True,
    )
    affinity = AttrDict(
        all_cpus=[0, 1],  # For exp launcher to use taskset.
        optimizer=opt_affinities,
        sampler=smp_affinity,
        set_affinity=True,
    )
    sampler = AsyncCpuSampler(EnvCls=_make_env,
                              env_kwargs=dict(rank=0),
                              batch_T=600,
                              batch_B=3,
                              max_decorrelation_steps=0,
                              CollectorCls=DbCpuResetCollector)
    algo = SAC(batch_size=256,
               min_steps_learn=10000,
               replay_size=1000000,
               replay_ratio=1,
               target_update_interval=1,
               target_entropy=-9,
               target_update_tau=0.01,
               learning_rate=0.00025,
               action_prior="uniform",
               reward_scale=1,
               reparameterize=True,
               clip_grad_norm=1e9,
               n_step_return=1,
               updates_per_sync=1,
               bootstrap_timelimit=False)  # Run with defaults.
    agent = SacAgent(model_kwargs={'hidden_sizes': [256, 256]})
    runner = AsyncRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=10000,
        affinity=affinity,
    )
    config = dict(env_id='picking')
    name = "sac_rlpyt_picking"
    # Log alongside this script's directory; snapshot every iteration.
    log_dir = os.path.join(os.path.dirname(__file__), "sac_rlpyt_picking")
    with logger_context(log_dir,
                        0,
                        name,
                        config,
                        use_summary_writer=False,
                        snapshot_mode='all'):
        runner.train()
def _mux_sampler(common_kwargs, worker_kwargs):
    """Variant of `rlpyt.samplers.parallel.worker.sampling_process` that is
    able to supply different environment keyword arguments to each
    environment that makes up a batch."""
    c, w = AttrDict(**common_kwargs), AttrDict(**worker_kwargs)
    initialize_worker(w.rank, w.seed, w.cpus, c.torch_threads)
    # vvv CHANGED LINES vvv
    # When env_kwargs is a sequence, each env picks its own kwargs by its
    # global rank; otherwise all envs share the single kwargs dict.
    if isinstance(c.env_kwargs, (list, tuple)):
        env_ranks = w["env_ranks"]
        envs = [c.EnvCls(**c.env_kwargs[rank]) for rank in env_ranks]
    else:
        envs = [c.EnvCls(**c.env_kwargs) for _ in range(w.n_envs)]
    # ^^^ CHANGED LINES ^^^
    collector = c.CollectorCls(
        rank=w.rank,
        envs=envs,
        samples_np=w.samples_np,
        batch_T=c.batch_T,
        TrajInfoCls=c.TrajInfoCls,
        agent=c.get("agent", None),  # Optional depending on parallel setup.
        sync=w.get("sync", None),
        step_buffer_np=w.get("step_buffer_np", None),
        global_B=c.get("global_B", 1),
        env_ranks=w.get("env_ranks", None),
    )
    agent_inputs, traj_infos = collector.start_envs(c.max_decorrelation_steps)
    collector.start_agent()
    if c.get("eval_n_envs", 0) > 0:
        eval_envs = [
            c.EnvCls(**c.eval_env_kwargs) for _ in range(c.eval_n_envs)
        ]
        eval_collector = c.eval_CollectorCls(
            rank=w.rank,
            envs=eval_envs,
            TrajInfoCls=c.TrajInfoCls,
            traj_infos_queue=c.eval_traj_infos_queue,
            max_T=c.eval_max_T,
            agent=c.get("agent", None),
            sync=w.get("sync", None),
            step_buffer_np=w.get("eval_step_buffer_np", None),
        )
    else:
        eval_envs = list()
    ctrl = c.ctrl
    # Signal the master that startup is complete.
    ctrl.barrier_out.wait()
    while True:
        collector.reset_if_needed(agent_inputs)  # Outside barrier?
        ctrl.barrier_in.wait()
        if ctrl.quit.value:
            break
        if ctrl.do_eval.value:
            eval_collector.collect_evaluation(
                ctrl.itr.value)  # Traj_infos to queue inside.
        else:
            (agent_inputs, traj_infos,
             completed_infos) = collector.collect_batch(
                 agent_inputs, traj_infos, ctrl.itr.value)
            for info in completed_infos:
                c.traj_infos_queue.put(info)
        ctrl.barrier_out.wait()
    # Clean up all environments on exit.
    for env in envs + eval_envs:
        env.close()
def build_async_affinity(run_slot, gpu, cpu, gpr=1, sgr=0, oss=0, cpw=1,
                         hto=None, res=1, skt=1, alt=0, saf=1):
    """Build optimizer and sampler affinities for one async run slot.

    :param run_slot: run-slot index (0-based).
    :param gpu: total number of GPUs.
    :param cpu: total number of physical CPU cores.
    :param gpr: GPUs per run for the optimizer.
    :param sgr: sampler GPUs per run (0 -> CPU sampler).
    :param oss: "optimizer-sampler share": sampler uses the optimizer GPUs.
    :param cpw: cores per worker.
    :param hto: hyperthread offset; None defaults to ``cpu``, 0 is OFF.
    :param res: cores reserved per optimizer GPU.
    :param skt: number of CPU sockets.
    :param alt: alternating-sampler flag, passed through.
    :param saf: whether to actually set process affinities.
    """
    oss = bool(oss)
    sgr = gpr if oss else sgr  # Shared mode: sampler uses optimizer GPUs.
    total_gpr = (gpr + sgr * (not oss))  # GPUs consumed per run slot.
    n_run_slots = gpu // total_gpr
    assert run_slot < n_run_slots
    cpr = cpu // n_run_slots  # Cores per run slot.
    smp_cpr = cpr - res * gpr  # Sampler cores = slot cores minus reserved.
    gpu_per_skt = gpu // skt
    hto = cpu if hto is None else hto  # Default is None, 0 is OFF.
    cpu_per_skt = max(cpu, hto) // skt
    opt_affinities = list()
    smp_affinities = list()
    all_cpus = tuple()
    if total_gpr <= gpu_per_skt:
        # Whole run fits within one socket.
        run_per_skt = n_run_slots // skt
        assert n_run_slots % skt == 0  # Relax later?
        skt_per_run = 1
        run_in_skt = run_slot % run_per_skt
        my_skt = run_slot // run_per_skt
        low_opt_gpu = my_skt * gpu_per_skt + run_in_skt * total_gpr
        high_opt_gpu = low_opt_gpu + gpr
        my_opt_gpus = list(range(low_opt_gpu, high_opt_gpu))
        # Sampler GPUs either share the optimizer's or follow directly after.
        my_smp_gpus = (my_opt_gpus if oss else list(
            range(high_opt_gpu, high_opt_gpu + sgr)))
    else:  # One run takes more than one socket: spread opt gpus across sockets.
        skt_per_run = skt // n_run_slots
        low_skt = run_slot * skt_per_run
        assert gpr % skt_per_run == 0, "Maybe try n_socket=1."
        assert sgr % skt_per_run == 0, "Maybe try n_socket=1."
        my_opt_gpus = list()
        my_smp_gpus = list()
        run_in_skt = run_per_skt = 0
        for s in range(skt_per_run):
            low_opt_gpu = (low_skt + s) * gpu_per_skt
            high_opt_gpu = low_opt_gpu + gpr // skt_per_run
            my_opt_gpus.extend(list(range(low_opt_gpu, high_opt_gpu)))
            if oss:
                my_smp_gpus = my_opt_gpus
            else:
                high_smp_gpu = high_opt_gpu + sgr // skt_per_run
                my_smp_gpus.extend(list(range(high_opt_gpu, high_smp_gpu)))
    # One reserved core block per optimizer GPU.
    for i, opt_gpu in enumerate(my_opt_gpus):
        gpu_in_skt = opt_gpu % gpu_per_skt
        gpu_skt = opt_gpu // gpu_per_skt
        gpu_res = i if run_per_skt >= 1 else gpu_in_skt
        low_opt_core = (gpu_skt * cpu_per_skt + run_in_skt * cpr +
                        gpu_res * res)
        high_opt_core = low_opt_core + res
        opt_cores = tuple(range(low_opt_core, high_opt_core))
        opt_cpus = get_master_cpus(opt_cores, hto)
        opt_affinity = dict(cpus=opt_cpus,
                            cuda_idx=opt_gpu,
                            torch_threads=len(opt_cores),
                            set_affinity=bool(saf))
        opt_affinities.append(opt_affinity)
        all_cpus += opt_cpus
    # Round sampler cores down to a whole number of workers.
    wrkr_per_smp = smp_cpr // cpw
    smp_cpr = wrkr_per_smp * cpw
    smp_cpg = smp_cpr // max(1, sgr)  # Sampler cores per sampler GPU.
    for i, smp_gpu in enumerate(my_smp_gpus):
        gpu_skt = smp_gpu // gpu_per_skt
        gpu_in_skt = smp_gpu % gpu_per_skt
        smp_cpu_off = (i if run_per_skt >= 1 else gpu_in_skt -
                       (gpr // skt_per_run))
        low_smp_core = (gpu_skt * cpu_per_skt + run_in_skt * cpr +
                        (gpr // skt_per_run) * res + smp_cpu_off * smp_cpg)
        high_smp_core = low_smp_core + smp_cpg
        master_cores = tuple(range(low_smp_core, high_smp_core))
        master_cpus = get_master_cpus(master_cores, hto)
        workers_cpus = get_workers_cpus(master_cores, cpw, hto, alt)
        smp_affinity = AttrDict(
            all_cpus=master_cpus,
            master_cpus=master_cpus,
            workers_cpus=workers_cpus,
            master_torch_threads=len(master_cores),
            worker_torch_threads=cpw,
            cuda_idx=smp_gpu,
            alternating=bool(alt),  # Just to pass through a check.
            set_affinity=bool(saf),
        )
        smp_affinities.append(smp_affinity)
        all_cpus += master_cpus
    if not smp_affinities:  # sgr==0; CPU sampler.
        if total_gpr <= gpu_per_skt:
            low_smp_core = (my_skt * cpu_per_skt + run_in_skt * cpr +
                            gpr * res)
            master_cores = tuple(range(low_smp_core, low_smp_core + smp_cpr))
        else:
            master_cores = tuple()
            for s in range(skt_per_run):
                low_smp_core = ((low_skt + s) * cpu_per_skt +
                                (gpr // gpu_per_skt) * res)
                master_cores += tuple(
                    range(low_smp_core,
                          low_smp_core + smp_cpr // skt_per_run))
        master_cpus = get_master_cpus(master_cores, hto)
        workers_cpus = get_workers_cpus(master_cores, cpw, hto, alt)
        # Single AttrDict (not a list) in the CPU-sampler case.
        smp_affinities = AttrDict(
            all_cpus=master_cpus,
            master_cpus=master_cpus,
            workers_cpus=workers_cpus,
            master_torch_threads=len(master_cores),
            worker_torch_threads=cpw,
            cuda_idx=None,
            alternating=bool(alt),  # Just to pass through a check.
            set_affinity=bool(saf),
        )
        all_cpus += master_cpus
    affinity = AttrDict(
        all_cpus=all_cpus,  # For exp launcher to use taskset.
        optimizer=opt_affinities,
        sampler=smp_affinities,
        set_affinity=bool(saf),
    )
    return affinity
def sampling_process(common_kwargs, worker_kwargs):
    """Target function used for forking parallel worker processes in the
    samplers. After ``initialize_worker()``, it creates the specified number
    of environment instances and gives them to the collector when
    instantiating it. It then calls collector startup methods for
    environments and agent. If applicable, instantiates evaluation
    environment instances and evaluation collector. Then enters infinite
    loop, waiting for signals from master to collect training samples or
    else run evaluation, until signaled to exit.

    This variant additionally supports heatmap logging on the first env
    only and optional video recording via gym's Monitor wrapper.
    """
    c, w = AttrDict(**common_kwargs), AttrDict(**worker_kwargs)
    initialize_worker(w.rank, w.seed, w.cpus, c.torch_threads)
    envs = [c.EnvCls(**c.env_kwargs) for _ in range(w.n_envs)]
    # Only the first env keeps heatmap logging enabled, to avoid duplicates.
    log_heatmaps = c.env_kwargs.get('log_heatmaps', None)
    if log_heatmaps is not None and log_heatmaps == True:
        for env in envs[1:]:
            env.log_heatmaps = False
    if c.record_freq > 0:
        if c.env_kwargs['game'] in ATARI_ENVS:
            # Atari: record raw frames from the first env.
            envs[0].record_env = True
            os.makedirs(os.path.join(c.log_dir, 'videos/frames'))
        elif c.get(
                "eval_n_envs", 0
        ) == 0:  # only record workers if no evaluation processes are performed
            envs[0] = Monitor(envs[0],
                              c.log_dir + '/videos',
                              video_callable=lambda episode_id: episode_id %
                              c.record_freq == 0)
    set_envs_seeds(envs, w.seed)
    collector = c.CollectorCls(
        rank=w.rank,
        envs=envs,
        samples_np=w.samples_np,
        batch_T=c.batch_T,
        TrajInfoCls=c.TrajInfoCls,
        agent=c.get("agent", None),  # Optional depending on parallel setup.
        sync=w.get("sync", None),
        step_buffer_np=w.get("step_buffer_np", None),
        global_B=c.get("global_B", 1),
        env_ranks=w.get("env_ranks", None),
        no_extrinsic=c.no_extrinsic)
    agent_inputs, traj_infos = collector.start_envs(c.max_decorrelation_steps)
    collector.start_agent()
    if c.get("eval_n_envs", 0) > 0:
        eval_envs = [
            c.EnvCls(**c.eval_env_kwargs) for _ in range(c.eval_n_envs)
        ]
        if c.record_freq > 0:
            # Record evaluation episodes from the first eval env.
            eval_envs[0] = Monitor(eval_envs[0],
                                   c.log_dir + '/videos',
                                   video_callable=lambda episode_id:
                                   episode_id % c.record_freq == 0)
        set_envs_seeds(eval_envs, w.seed)
        eval_collector = c.eval_CollectorCls(
            rank=w.rank,
            envs=eval_envs,
            TrajInfoCls=c.TrajInfoCls,
            traj_infos_queue=c.eval_traj_infos_queue,
            max_T=c.eval_max_T,
            agent=c.get("agent", None),
            sync=w.get("sync", None),
            step_buffer_np=w.get("eval_step_buffer_np", None),
        )
    else:
        eval_envs = list()
    ctrl = c.ctrl
    # Signal the master that startup is complete.
    ctrl.barrier_out.wait()
    while True:
        collector.reset_if_needed(agent_inputs)  # Outside barrier?
        ctrl.barrier_in.wait()
        if ctrl.quit.value:
            logger.log('Quitting worker ...')
            break
        if ctrl.do_eval.value:
            eval_collector.collect_evaluation(
                ctrl.itr.value)  # Traj_infos to queue inside.
        else:
            agent_inputs, traj_infos, completed_infos = collector.collect_batch(
                agent_inputs, traj_infos, ctrl.itr.value)
            for info in completed_infos:
                c.traj_infos_queue.put(info)
        ctrl.barrier_out.wait()
    # Clean up all environments on exit.
    for env in envs + eval_envs:
        logger.log('Stopping env ...')
        env.close()
def sampling_process(common_kwargs, worker_kwargs):
    """Target function used for forking parallel worker processes in the
    samplers. After ``initialize_worker()``, it creates the specified number
    of environment instances and gives them to the collector when
    instantiating it. It then calls collector startup methods for
    environments and agent. If applicable, instantiates evaluation
    environment instances and evaluation collector. Then enters infinite
    loop, waiting for signals from master to collect training samples or
    else run evaluation, until signaled to exit.
    """
    c, w = AttrDict(**common_kwargs), AttrDict(**worker_kwargs)
    initialize_worker(w.rank, w.seed, w.cpus, c.torch_threads)
    envs = [c.EnvCls(**c.env_kwargs) for _ in range(w.n_envs)]
    set_envs_seeds(envs, w.seed)
    collector = c.CollectorCls(
        rank=w.rank,
        envs=envs,
        samples_np=w.samples_np,
        batch_T=c.batch_T,
        TrajInfoCls=c.TrajInfoCls,
        agent=c.get("agent", None),  # Optional depending on parallel setup.
        sync=w.get("sync", None),
        step_buffer_np=w.get("step_buffer_np", None),
        global_B=c.get("global_B", 1),
        env_ranks=w.get("env_ranks", None),
    )
    agent_inputs, traj_infos = collector.start_envs(c.max_decorrelation_steps)
    collector.start_agent()
    if c.get("eval_n_envs", 0) > 0:
        eval_envs = [
            c.EnvCls(**c.eval_env_kwargs) for _ in range(c.eval_n_envs)
        ]
        set_envs_seeds(eval_envs, w.seed)
        eval_collector = c.eval_CollectorCls(
            rank=w.rank,
            envs=eval_envs,
            TrajInfoCls=c.TrajInfoCls,
            traj_infos_queue=c.eval_traj_infos_queue,
            max_T=c.eval_max_T,
            agent=c.get("agent", None),
            sync=w.get("sync", None),
            step_buffer_np=w.get("eval_step_buffer_np", None),
        )
    else:
        eval_envs = list()
    ctrl = c.ctrl
    # Signal the master that startup is complete.
    ctrl.barrier_out.wait()
    while True:
        collector.reset_if_needed(agent_inputs)  # Outside barrier?
        ctrl.barrier_in.wait()
        if ctrl.quit.value:
            break
        if ctrl.do_eval.value:  # Traj_infos to queue inside.
            eval_collector.collect_evaluation(ctrl.itr.value)
        else:
            agent_inputs, traj_infos, completed_infos = collector.collect_batch(
                agent_inputs, traj_infos, ctrl.itr.value)
            for info in completed_infos:
                c.traj_infos_queue.put(info)
        ctrl.barrier_out.wait()
    # Clean up all environments on exit.
    for env in envs + eval_envs:
        env.close()
def sampling_process(common_kwargs, worker_kwargs):
    """
    Arguments fed from the Sampler class in master process.
    Worker sampling-process target function.

    :param common_kwargs: arguments shared by all workers.
    :param worker_kwargs: arguments that may differ per worker.
    """
    c, w = AttrDict(**common_kwargs), AttrDict(**worker_kwargs)
    initialize_worker(w.rank, w.seed, w.cpus, c.torch_threads)
    # Build the training environment instances and the collector.
    envs = [c.EnvCls(**c.env_kwargs) for _ in range(w.n_envs)]
    collector = c.CollectorCls(
        rank=w.rank,
        envs=envs,
        samples_np=w.samples_np,
        batch_T=c.batch_T,
        TrajInfoCls=c.TrajInfoCls,
        agent=c.get("agent", None),  # Optional depending on parallel setup.
        sync=w.get("sync", None),
        step_buffer_np=w.get("step_buffer_np", None),
        global_B=c.get("global_B", 1),
        env_ranks=w.get("env_ranks", None),
    )
    agent_inputs, traj_infos = collector.start_envs(
        c.max_decorrelation_steps)  # Collects the first batch of samples.
    collector.start_agent()  # Collector initialization.
    # Build the evaluation environment instances and collector, if any.
    if c.get("eval_n_envs", 0) > 0:
        eval_envs = [
            c.EnvCls(**c.eval_env_kwargs) for _ in range(c.eval_n_envs)
        ]
        eval_collector = c.eval_CollectorCls(
            rank=w.rank,
            envs=eval_envs,
            TrajInfoCls=c.TrajInfoCls,
            traj_infos_queue=c.eval_traj_infos_queue,
            max_T=c.eval_max_T,
            agent=c.get("agent", None),
            sync=w.get("sync", None),
            step_buffer_np=w.get("eval_step_buffer_np", None),
        )
    else:
        eval_envs = list()
    # Controller that coordinates the concurrently running worker processes.
    ctrl = c.ctrl
    # Each worker calls wait() once; together with the single wait() in
    # ParallelSamplerBase.initialize(), that makes exactly n_worker + 1.
    ctrl.barrier_out.wait()
    while True:
        collector.reset_if_needed(agent_inputs)  # Outside barrier?
        ctrl.barrier_in.wait()
        if ctrl.quit.value:
            # When the master process sets this to True, all workers stop
            # sampling and exit.
            break
        if ctrl.do_eval.value:
            # Only collect evaluation data after the master's
            # evaluate_agent() has set this flag to True.
            eval_collector.collect_evaluation(
                ctrl.itr.value)  # Traj_infos to queue inside.
        else:  # Not doing evaluation.
            agent_inputs, traj_infos, completed_infos = collector.collect_batch(
                agent_inputs, traj_infos, ctrl.itr.value)
            for info in completed_infos:
                # Push this worker's stats onto the queue shared by all
                # worker processes.
                c.traj_infos_queue.put(info)
        ctrl.barrier_out.wait()
    # Clean up the environments.
    for env in envs + eval_envs:
        env.close()