def init_workers(self):
    """
    Create the inter-worker queues and synchronization primitives, then spawn all worker types.

    Workers are brought up in a fixed order: learners (started and initialized one by one),
    then policy workers, then actor workers (via self.init_subset).
    """
    cfg = self.cfg

    # one queue per worker index in [0, cfg.num_workers); shared with policy workers and init_subset
    actor_queues = [faster_fifo.Queue() for _ in range(cfg.num_workers)]

    # policy_id -> list of queues, one per policy worker serving that policy
    policy_worker_queues = {
        policy_id: [TorchJoinableQueue() for _ in range(cfg.policy_workers_per_policy)]
        for policy_id in range(cfg.num_policies)
    }

    log.info('Initializing learners...')
    policy_locks = [multiprocessing.Lock() for _ in range(cfg.num_policies)]
    resume_experience_collection_cv = [multiprocessing.Condition() for _ in range(cfg.num_policies)]

    # learner index is a running counter over policies (one learner per policy)
    for learner_idx, policy_id in enumerate(range(cfg.num_policies)):
        learner = LearnerWorker(
            learner_idx,
            policy_id,
            self.cfg,
            self.obs_space,
            self.action_space,
            self.report_queue,
            policy_worker_queues[policy_id],
            self.traj_buffers,
            policy_locks[policy_id],
            resume_experience_collection_cv[policy_id],
        )
        learner.start_process()
        learner.init()
        self.learner_workers[policy_id] = learner

    log.info('Initializing policy workers...')
    for policy_id in range(cfg.num_policies):
        workers_of_policy = self.policy_workers[policy_id] = []

        # a single queue per policy, handed to all policy workers of that policy
        policy_queue = faster_fifo.Queue()
        self.policy_queues[policy_id] = policy_queue

        for worker_idx, worker_queue in enumerate(policy_worker_queues[policy_id]):
            policy_worker = PolicyWorker(
                worker_idx,
                policy_id,
                self.cfg,
                self.obs_space,
                self.action_space,
                self.traj_buffers,
                policy_queue,
                actor_queues,
                self.report_queue,
                worker_queue,
                policy_locks[policy_id],
                resume_experience_collection_cv[policy_id],
            )
            workers_of_policy.append(policy_worker)
            policy_worker.start_process()

    log.info('Initializing actors...')

    # Actor workers can be initialized in groups, which helps with envs that e.g. crash when too many
    # environments are created in parallel. None of the envs supported out of the box need this, so the
    # group size is effectively unlimited and initialization is parallelized as much as possible.
    # If your environment does need a limit, a global lock such as FileLock (see doom_gym.py)
    # is perhaps a better solution.
    self.actor_workers = []
    max_parallel_init = int(1e9)  # might be useful to limit this for some envs
    worker_indices = list(range(cfg.num_workers))
    for group_start in range(0, cfg.num_workers, max_parallel_init):
        group_indices = worker_indices[group_start:group_start + max_parallel_init]
        group_workers = self.init_subset(group_indices, actor_queues)
        self.actor_workers.extend(group_workers)
def __init__(self, player_id, make_env_func, env_config, use_multiprocessing=False, reset_on_init=True):
    """
    Remember the environment construction parameters and immediately launch the worker.

    :param player_id: stored on the instance as-is
    :param make_env_func: stored on the instance as-is (not called here)
    :param env_config: stored on the instance as-is
    :param use_multiprocessing: if True, self.start runs in a non-daemon Process and the task/result queues
        are faster_fifo queues; otherwise self.start runs in a thread and the queues are Queue objects
    :param reset_on_init: stored on the instance as-is (not used here)
    """
    self.player_id = player_id
    self.make_env_func = make_env_func
    self.env_config = env_config
    self.reset_on_init = reset_on_init

    if not use_multiprocessing:
        self.process = threading.Thread(target=self.start)
        tasks = Queue()
        results = Queue()
    else:
        self.process = Process(target=self.start, daemon=False)
        tasks = faster_fifo.Queue()
        results = faster_fifo.Queue()

    self.task_queue = tasks
    self.result_queue = results

    # the worker begins executing self.start as soon as the object is constructed
    self.process.start()
def __init__(
    self,
    worker_idx,
    policy_id,
    cfg,
    obs_space,
    action_space,
    report_queue,
    policy_worker_queues,
    shared_buffers,
    policy_lock,
    resume_experience_collection_cv,
):
    """
    Set up the learner's state and create (but do not start) its daemon process targeting self._run.

    :param worker_idx: index of this learner, stored on the instance
    :param policy_id: index of the policy this learner is created for, stored on the instance
    :param cfg: configuration object; cfg.train_in_background_thread is read here
    :param obs_space: stored on the instance as-is
    :param action_space: stored on the instance as-is
    :param report_queue: stored on the instance as-is
    :param policy_worker_queues: queues of the policy workers that use the same policy
    :param shared_buffers: object providing tensor_trajectories, is_traj_tensor_available,
        policy_versions and stop_experience_collection
    :param policy_lock: stored on the instance as-is
    :param resume_experience_collection_cv: stored on the instance as-is
    """
    log.info('Initializing the learner %d for policy %d', worker_idx, policy_id)

    # identity and configuration
    self.worker_idx = worker_idx
    self.policy_id = policy_id
    self.cfg = cfg
    self.obs_space = obs_space
    self.action_space = action_space

    # PBT-related state
    # True when the model has to be saved to disk on the next training iteration
    self.should_save_model = True
    # non-None when our parameters have to be replaced with another policy's parameters
    self.load_policy_id = None
    self.pbt_mutex = threading.Lock()
    # non-None when the learning hyperparameters have to be updated
    self.new_cfg = None

    self.terminate = False

    # references into the shared buffers object
    self.rollout_tensors = shared_buffers.tensor_trajectories
    self.traj_tensors_available = shared_buffers.is_traj_tensor_available
    self.policy_versions = shared_buffers.policy_versions
    self.stop_experience_collection = shared_buffers.stop_experience_collection

    # these start out empty and are not created in this constructor
    self.device = None
    self.actor_critic = None
    self.optimizer = None

    self.policy_lock = policy_lock
    self.resume_experience_collection_cv = resume_experience_collection_cv

    self.task_queue = faster_fifo.Queue()
    self.report_queue = report_queue

    # both events are explicitly cleared right after creation
    initialized_event = MultiprocessingEvent()
    initialized_event.clear()
    self.initialized_event = initialized_event

    model_saved_event = MultiprocessingEvent()
    model_saved_event.clear()
    self.model_saved_event = model_saved_event

    # queues corresponding to policy workers using the same policy;
    # weight updates are sent via these queues
    self.policy_worker_queues = policy_worker_queues

    self.experience_buffer_queue = Queue()

    self.tensor_batch_pool = ObjectPool()
    self.tensor_batcher = TensorBatcher(self.tensor_batch_pool)

    # set to False for debugging no-training regime
    self.with_training = True

    # set to False for debugging
    self.train_in_background = self.cfg.train_in_background_thread
    self.training_thread = None
    if self.train_in_background:
        self.training_thread = Thread(target=self._train_loop)
    self.train_thread_initialized = threading.Event()

    self.is_training = False

    self.train_step = 0
    self.env_steps = 0

    # decay rate at which summaries are collected:
    # save summaries every 20 seconds in the beginning, but decay to every 4 minutes in the limit,
    # because frequent summaries are not needed for longer experiments
    self.summary_rate_decay_seconds = LinearDecay([(0, 20), (100000, 120), (1000000, 240)])
    self.last_summary_time = 0

    self.last_saved_time = 0
    self.last_milestone_time = 0

    # bookkeeping for discarded experience
    self.discarded_experience_over_time = deque([], maxlen=30)
    self.discarded_experience_timer = time.time()
    self.num_discarded_rollouts = 0

    # the process object is only constructed here, it is not started
    self.process = Process(target=self._run, daemon=True)
def __init__(self, cfg):
    """
    Probe a temporary environment for its spaces, allocate shared buffers and prepare bookkeeping state.

    A throwaway environment is created only to read the observation/action spaces, the number of agents
    and (when cfg.with_pbt is set) the reward shaping scheme; it is closed before shared memory is allocated.
    No workers are created here.

    :param cfg: configuration object, forwarded to the parent constructor
    """
    super().__init__(cfg)

    tmp_env = make_env_func(self.cfg, env_config=None)
    self.obs_space = tmp_env.observation_space
    self.action_space = tmp_env.action_space
    self.num_agents = tmp_env.num_agents

    self.reward_shaping_scheme = None
    if self.cfg.with_pbt:
        base_env = tmp_env.unwrapped
        if hasattr(base_env, '_reward_shaping_wrapper'):
            # noinspection PyProtectedMember
            self.reward_shaping_scheme = base_env._reward_shaping_wrapper.reward_shaping_scheme
        else:
            # the multi-agent Doom wrapper is optional: if it cannot be imported, keep the scheme as None
            try:
                from envs.doom.multiplayer.doom_multiagent_wrapper import MultiAgentEnv
                if isinstance(base_env, MultiAgentEnv):
                    self.reward_shaping_scheme = base_env.default_reward_shaping
            except ImportError:
                pass

    tmp_env.close()

    # shared memory allocation
    self.traj_buffers = SharedBuffers(self.cfg, self.num_agents, self.obs_space, self.action_space)

    # workers and their queues; the containers start empty and are not filled in this constructor
    self.actor_workers = None
    self.report_queue = faster_fifo.Queue(20 * 1000 * 1000)
    self.policy_workers = {}
    self.policy_queues = {}
    self.learner_workers = {}
    self.workers_by_handle = None

    self.policy_inputs = [[] for _ in range(self.cfg.num_policies)]

    # one (initially empty) dict for every (worker index, split index) pair
    self.policy_outputs = {
        (worker_idx, split_idx): {}
        for worker_idx in range(self.cfg.num_workers)
        for split_idx in range(self.cfg.worker_num_splits)
    }

    self.policy_avg_stats = {}
    self.policy_lag = [{} for _ in range(self.cfg.num_policies)]

    self.last_timing = {}
    self.env_steps = {}
    self.samples_collected = [0] * self.cfg.num_policies
    self.total_env_steps_since_resume = 0

    # currently this applies only to the current run, not experiment as a whole;
    # to change this behavior we'd need to save the state of the main loop to a filesystem
    self.total_train_seconds = 0

    self.last_report = time.time()
    self.last_experiment_summaries = 0

    self.report_interval = 5.0  # sec
    self.experiment_summaries_interval = self.cfg.experiment_summaries_interval  # sec

    # 10 seconds, 1 minute, 5 minutes (presumably counted in report intervals of 5 sec - confirm)
    self.avg_stats_intervals = (2, 12, 60)

    self.fps_stats = deque([], maxlen=max(self.avg_stats_intervals))
    self.throughput_stats = [deque([], maxlen=5) for _ in range(self.cfg.num_policies)]
    self.avg_stats = {}
    self.stats = {}  # regular (non-averaged) stats

    # one summary writer per policy, each writing to its own subdirectory named after the policy index
    self.writers = {}
    for policy_id in range(self.cfg.num_policies):
        writer_dir = ensure_dir_exists(join(summaries_dir(experiment_dir(cfg=self.cfg)), str(policy_id)))
        self.writers[policy_id] = SummaryWriter(writer_dir, flush_secs=20)

    self.pbt = PopulationBasedTraining(self.cfg, self.reward_shaping_scheme, self.writers)
def init_workers(self):
    """
    Spawn learner, policy and actor workers together with the queues and locks they share.

    Order of operations: all learners are started and initialized, then all policy workers are
    started, then actor workers are created in groups through self.init_subset.
    """
    num_policies = self.cfg.num_policies
    num_workers = self.cfg.num_workers

    actor_queues = [faster_fifo.Queue() for _ in range(num_workers)]

    # for every policy: one queue per policy worker of that policy
    policy_worker_queues = dict()
    for policy_id in range(num_policies):
        policy_worker_queues[policy_id] = [
            TorchJoinableQueue() for _ in range(self.cfg.policy_workers_per_policy)
        ]

    log.info('Initializing learners...')
    policy_locks = [multiprocessing.Lock() for _ in range(num_policies)]
    resume_experience_collection_cv = [multiprocessing.Condition() for _ in range(num_policies)]

    for learner_idx, policy_id in enumerate(range(num_policies)):
        # the lock and the condition variable of a policy are shared by its learner and policy workers
        lock = policy_locks[policy_id]
        resume_cv = resume_experience_collection_cv[policy_id]

        learner_worker = LearnerWorker(
            learner_idx, policy_id, self.cfg, self.obs_space, self.action_space, self.report_queue,
            policy_worker_queues[policy_id], self.traj_buffers, lock, resume_cv,
        )
        learner_worker.start_process()
        learner_worker.init()

        self.learner_workers[policy_id] = learner_worker

    log.info('Initializing policy workers...')
    for policy_id in range(num_policies):
        self.policy_workers[policy_id] = []

        policy_queue = faster_fifo.Queue()
        self.policy_queues[policy_id] = policy_queue

        lock = policy_locks[policy_id]
        resume_cv = resume_experience_collection_cv[policy_id]

        for worker_idx in range(self.cfg.policy_workers_per_policy):
            policy_worker = PolicyWorker(
                worker_idx, policy_id, self.cfg, self.obs_space, self.action_space, self.traj_buffers,
                policy_queue, actor_queues, self.report_queue,
                policy_worker_queues[policy_id][worker_idx], lock, resume_cv,
            )
            self.policy_workers[policy_id].append(policy_worker)
            policy_worker.start_process()

    log.info('Initializing actors...')

    self.actor_workers = []
    # with this value all actor workers end up in a single group
    max_parallel_init = int(1e9)  # might be useful to limit this for some envs
    worker_indices = list(range(num_workers))
    for first in range(0, num_workers, max_parallel_init):
        subset = worker_indices[first:first + max_parallel_init]
        workers = self.init_subset(subset, actor_queues)
        self.actor_workers.extend(workers)
def __init__(self, cfg, num_agents, obs_space, action_space):
    """
    Allocate the tensors shared between components and the queue of free trajectory buffer indices.

    :param cfg: configuration object (num_envs_per_worker, worker_num_splits, num_workers and
        num_policies are read here)
    :param num_agents: number of agents, used as one dimension of the policy output tensor
    :param obs_space: observation space; must be a spaces.Dict
    :param action_space: action space, used to compute the number of actions and action logits
    :raises Exception: if obs_space is not a spaces.Dict
    """
    self.cfg = cfg
    self.num_agents = num_agents
    self.envs_per_split = cfg.num_envs_per_worker // cfg.worker_num_splits
    self.num_traj_buffers = self.calc_num_trajectory_buffers()

    num_actions = calc_num_actions(action_space)
    num_action_logits = calc_num_logits(action_space)
    hidden_size = get_hidden_size(self.cfg)

    log.debug('Allocating shared memory for trajectories')
    self._tensors = TensorDict()

    # policy inputs: one tensor per entry of the Dict observation space
    obs_dict = TensorDict()
    self._tensors['obs'] = obs_dict
    if not isinstance(obs_space, spaces.Dict):
        raise Exception('Only Dict observations spaces are supported')
    for space_name, space in obs_space.spaces.items():
        obs_dict[space_name] = self.init_tensor(space.dtype, space.shape)

    # env outputs as (key, dtype, initial fill value):
    #  - rewards are filled with -42.42 so that using uninitialized values is obvious
    #  - -1 is an invalid policy index, experience from policy "-1" is always ignored
    env_outputs = (
        ('rewards', torch.float32, -42.42),
        ('dones', torch.bool, True),
        ('policy_id', torch.int, -1),
    )
    for key, dtype, fill_value in env_outputs:
        self._tensors[key] = self.init_tensor(dtype, [1])
        self._tensors[key].fill_(fill_value)

    # policy outputs as (name, size), kept sorted by name
    output_specs = [
        ('actions', num_actions),
        ('action_logits', num_action_logits),
        ('log_prob_actions', 1),
        ('values', 1),
        ('policy_version', 1),
        ('rnn_states', hidden_size),
    ]
    policy_outputs = sorted((PolicyOutput(*spec) for spec in output_specs), key=lambda out: out.name)

    for output in policy_outputs:
        self._tensors[output.name] = self.init_tensor(torch.float32, [output.size])

    ensure_memory_shared(self._tensors)

    # this is for performance optimization:
    # indexing in numpy arrays is faster than in PyTorch tensors
    self.tensors = self.tensor_dict_to_numpy()

    # Copying small policy outputs (e.g. individual value predictions & action logits) to shared memory is
    # a bottleneck on the policy worker. For optimization purposes we create additional tensors to hold
    # just concatenated policy outputs. Rollout workers parse the data and add it to the trajectory buffers
    # in a proper format.
    policy_outputs_combined_size = sum(output.size for output in policy_outputs)
    policy_outputs_shape = [
        self.cfg.num_workers,
        self.cfg.worker_num_splits,
        self.envs_per_split,
        self.num_agents,
        policy_outputs_combined_size,
    ]

    self.policy_outputs = policy_outputs
    self._policy_output_tensors = torch.zeros(policy_outputs_shape, dtype=torch.float32)
    self._policy_output_tensors.share_memory_()
    self.policy_output_tensors = self._policy_output_tensors.numpy()

    # one int32 version counter per policy, initialized to zero
    self._policy_versions = torch.zeros([self.cfg.num_policies], dtype=torch.int32)
    self._policy_versions.share_memory_()
    self.policy_versions = self._policy_versions.numpy()

    # a list of boolean flags to be shared among components that indicate that experience collection should
    # be temporarily stopped (e.g. due to too much experience accumulated on the learner);
    # every flag starts out as True
    self._stop_experience_collection = torch.ones([self.cfg.num_policies], dtype=torch.bool)
    self._stop_experience_collection.share_memory_()
    self.stop_experience_collection = self._stop_experience_collection.numpy()

    # 40 bytes to encode an int should be enough?
    queue_max_size_bytes = self.num_traj_buffers * 40
    self.free_buffers_queue = faster_fifo.Queue(max_size_bytes=queue_max_size_bytes)

    # since all buffers are initially free, we add all buffer indices to the queue
    free_indices = list(map(int, np.arange(self.num_traj_buffers)))
    self.free_buffers_queue.put_many_nowait(free_indices)