def _add_shortcuts(self, m, pairwise_distances):
    self.remove_shortcuts(m)  # first - remove all existing shortcuts

    shortcuts = self._shortcuts_distance(
        m, pairwise_distances, self.params.min_shortcut_dist, self.params.shortcut_window,
    )

    if len(shortcuts) <= 0:
        log.warning('Could not find any shortcuts')
        return

    random.shuffle(shortcuts)
    shortcut_risks = [s[0] for s in shortcuts]

    shortcuts_to_keep = int(self.params.shortcuts_to_keep_fraction * m.num_landmarks())
    keep = min(shortcuts_to_keep, len(shortcuts))

    percentile = (keep / len(shortcuts)) * 100
    max_risk = np.percentile(shortcut_risks, percentile)
    max_risk = min(max_risk, self.params.shortcut_risk_threshold)

    log.debug('Keep shortcuts with risk <= %.3f...', max_risk)
    shortcuts = [s for s in shortcuts if s[0] <= max_risk][:keep]
    shortcuts.sort(key=lambda x: x[-1], reverse=True)  # sort according to ground truth distance for logging
    log.debug('Kept %d shortcuts: %r...', len(shortcuts), shortcuts[:5])

    for shortcut in shortcuts:
        risk, i1, i2, d, coord_dist = shortcut
        m.add_edge(i1, i2, loop_closure=True)
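# A hedged, self-contained illustration of the percentile cutoff used above (the numbers
# are made up): with 10 candidate shortcuts and room to keep 4, the 40th percentile of
# the risk values becomes the threshold, capped by the configured risk ceiling.
#
# import numpy as np
# risks = [0.9, 0.1, 0.4, 0.2, 0.8, 0.3, 0.7, 0.5, 0.6, 0.95]
# keep, risk_ceiling = 4, 0.5
# cutoff = min(np.percentile(risks, (keep / len(risks)) * 100), risk_ceiling)
# kept = sorted(r for r in risks if r <= cutoff)[:keep]
# print(cutoff, kept)  # 0.46, [0.1, 0.2, 0.3, 0.4]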
def _learner_load_model(self, policy_id, replacement_policy):
    log.debug('Asking learner %d to load model from %d', policy_id, replacement_policy)

    load_task = (PbtTask.LOAD_MODEL, (policy_id, replacement_policy))
    learner_worker = self.learner_workers[policy_id]
    learner_worker.task_queue.put((TaskType.PBT, load_task))
def make_voxel_env(env_name, cfg=None, env_config=None, **kwargs):
    scenario_name = env_name.split('voxel_env_')[-1].casefold()
    log.debug('Using scenario %s', scenario_name)

    if 'multitask' in scenario_name:
        if env_config is not None and 'worker_index' in env_config:
            task_idx = env_config['worker_index']
        else:
            log.warning('Could not find information about task id. Use task_id=0. (It is okay if this message appears once)')
            task_idx = 0

        env = make_env_multitask(
            scenario_name,
            task_idx,
            num_envs=cfg.voxel_num_envs_per_instance,
            num_agents_per_env=cfg.voxel_num_agents_per_env,
            num_simulation_threads=cfg.voxel_num_simulation_threads,
            use_vulkan=cfg.voxel_use_vulkan,
        )
    else:
        env = VoxelEnv(
            scenario_name=scenario_name,
            num_envs=cfg.voxel_num_envs_per_instance,
            num_agents_per_env=cfg.voxel_num_agents_per_env,
            num_simulation_threads=cfg.voxel_num_simulation_threads,
            use_vulkan=cfg.voxel_use_vulkan,
        )

    env = Wrapper(env, cfg.voxel_increase_team_spirit, cfg.voxel_max_team_spirit_steps)
    return env
def print_stats(self, fps, sample_throughput, total_env_steps):
    fps_str = []
    for interval, fps_value in zip(self.avg_stats_intervals, fps):
        fps_str.append(f'{int(interval * self.report_interval)} sec: {fps_value:.1f}')
    fps_str = f'({", ".join(fps_str)})'

    samples_per_policy = ', '.join([f'{p}: {s:.1f}' for p, s in sample_throughput.items()])

    lag_stats = self.policy_lag[0]
    lag = AttrDict()
    for key in ['min', 'avg', 'max']:
        lag[key] = lag_stats.get(f'version_diff_{key}', -1)
    policy_lag_str = f'min: {lag.min:.1f}, avg: {lag.avg:.1f}, max: {lag.max:.1f}'

    log.debug(
        'Fps is %s. Total num frames: %d. Throughput: %s. Samples: %d. Policy #0 lag: (%s)',
        fps_str, total_env_steps, samples_per_policy, sum(self.samples_collected), policy_lag_str,
    )

    if 'reward' in self.policy_avg_stats:
        policy_reward_stats = []
        for policy_id in range(self.cfg.num_policies):
            reward_stats = self.policy_avg_stats['reward'][policy_id]
            if len(reward_stats) > 0:
                policy_reward_stats.append((policy_id, f'{np.mean(reward_stats):.3f}'))
        log.debug('Avg episode reward: %r', policy_reward_stats)
def _save_reward_shaping(self, policy_id):
    policy_reward_shaping_filename = policy_reward_shaping_file(self.cfg, policy_id)
    with open(policy_reward_shaping_filename, 'w') as json_file:
        log.debug('Saving policy-specific reward shaping %d to file %s', policy_id, policy_reward_shaping_filename)
        json.dump(self.policy_reward_shaping[policy_id], json_file)
def load_from_checkpoint(cfg):
    filename = cfg_file(cfg)
    if not os.path.isfile(filename):
        raise Exception(f'Could not load saved parameters for experiment {cfg.experiment}')

    with open(filename, 'r') as json_file:
        json_params = json.load(json_file)
        log.warning('Loading existing experiment configuration from %s', filename)
        loaded_cfg = AttrDict(json_params)

    # override the parameters in the config file with values passed from the command line
    for key, value in cfg.cli_args.items():
        if key in loaded_cfg and loaded_cfg[key] != value:
            log.debug('Overriding arg %r with value %r passed from command line', key, value)
            loaded_cfg[key] = value

    # incorporate extra CLI parameters that were not present in the JSON file
    for key, value in vars(cfg).items():
        if key not in loaded_cfg:
            log.debug('Adding new argument %r=%r that is not in the saved config file!', key, value)
            loaded_cfg[key] = value

    return loaded_cfg
def _save(self):
    checkpoint = self._get_checkpoint_dict()
    assert checkpoint is not None

    checkpoint_dir = self.checkpoint_dir(self.cfg, self.policy_id)
    tmp_filepath = join(checkpoint_dir, '.temp_checkpoint')
    checkpoint_name = f'checkpoint_{self.train_step:09d}_{self.env_steps}.pth'
    filepath = join(checkpoint_dir, checkpoint_name)

    # write to a temp file first and then rename, so we never leave a half-written checkpoint on disk
    log.info('Saving %s...', tmp_filepath)
    torch.save(checkpoint, tmp_filepath)
    log.info('Renaming %s to %s', tmp_filepath, filepath)
    os.rename(tmp_filepath, filepath)

    while len(self.get_checkpoints(checkpoint_dir)) > self.cfg.keep_checkpoints:
        oldest_checkpoint = self.get_checkpoints(checkpoint_dir)[0]
        if os.path.isfile(oldest_checkpoint):
            log.debug('Removing %s', oldest_checkpoint)
            os.remove(oldest_checkpoint)

    if self.cfg.save_milestones_sec > 0:
        # milestones enabled
        if time.time() - self.last_milestone_time >= self.cfg.save_milestones_sec:
            milestones_dir = ensure_dir_exists(join(checkpoint_dir, 'milestones'))
            milestone_path = join(milestones_dir, f'{checkpoint_name}.milestone')
            log.debug('Saving a milestone %s', milestone_path)
            shutil.copy(filepath, milestone_path)
            self.last_milestone_time = time.time()
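# A minimal, standalone sketch of the write-then-rename pattern used in _save() above
# (the helper name is hypothetical, not part of this codebase). os.rename() is atomic on
# POSIX within a single filesystem, so readers see either the old file or the complete
# new one, never a partially written checkpoint.
#
# import os
# import torch
#
# def atomic_torch_save(obj, filepath, tmp_suffix='.tmp'):
#     tmp_path = filepath + tmp_suffix
#     torch.save(obj, tmp_path)      # write the full file to a temp location first
#     os.rename(tmp_path, filepath)  # then atomically move it into place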
def train(self, latest_batch_of_experience, env_steps, agent):
    # latest batch of experience is not used here

    if self.params.distance_network_checkpoint is None:
        # don't train distance net if it's already provided
        self.distance_buffer.extract_data(self.trajectory_buffer.complete_trajectories)

        if env_steps - self._last_trained > self.params.distance_train_interval:
            if self.distance_buffer.has_enough_data():
                self.distance.train(self.distance_buffer.buffer, env_steps, agent)
                self._last_trained = env_steps

                # discard old experience
                self.distance_buffer.reset()

                # invalidate observation features because distance network has changed
                self.distance.obs_encoder.reset()

    if env_steps > self.params.distance_bootstrap and not self.is_initialized():
        log.debug('Curiosity is initialized @ %d steps!', env_steps)
        self.initialized = True

    self._expand_explored_region(env_steps, agent)
def _game_init(self, with_locking=True, max_parallel=10):
    lock_file = lock = None
    if with_locking:
        lock_file = doom_lock_file(max_parallel)
        lock = FileLock(lock_file)

    init_attempt = 0
    while True:
        init_attempt += 1
        try:
            if with_locking:
                with lock.acquire(timeout=20):
                    self.game.init()
            else:
                self.game.init()

            break
        except Timeout:
            if with_locking:
                log.debug(
                    'Another process currently holds the lock %s, attempt: %d',
                    lock_file, init_attempt,
                )
        except Exception as exc:
            log.warning('VizDoom game.init() threw an exception %r. Terminate process...', exc)
            from envs.env_utils import EnvCriticalError
            raise EnvCriticalError()
def test_buffer_performance(self):
    small_buffer = Buffer()
    small_buffer.add_many(obs=np.zeros([1000, 84, 84, 3], dtype=np.uint8))

    buffer = Buffer()

    t = Timing()

    with t.timeit('add'):
        for i in range(100):
            buffer.add_buff(small_buffer)

    huge_buffer = Buffer()
    with t.timeit('add_huge'):
        huge_buffer.add_buff(buffer)
        huge_buffer.add_buff(buffer)

    with t.timeit('single_add_small'):
        huge_buffer.add_buff(small_buffer)

    with t.timeit('clear_and_add'):
        huge_buffer.clear()
        huge_buffer.add_buff(buffer)
        huge_buffer.add_buff(buffer)

    with t.timeit('shuffle_and_add'):
        huge_buffer.clear()
        huge_buffer.add_buff(buffer)
        huge_buffer.add_buff(small_buffer)

        with t.timeit('shuffle'):
            huge_buffer.shuffle_data()

    log.debug('Timing: %s', t)
def train_feed_dict(self, env, data, params, use_gpu):
    num_batches = len(data.obs) // params.batch_size
    g = self.setup_graph(env, params, use_dataset=False)

    config = tf.ConfigProto(device_count={'GPU': 100 if use_gpu else 0})
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        for _ in range(params.ppo_epochs):
            epoch_starts = time.time()
            for i in range(num_batches):
                start, end = i * params.batch_size, (i + 1) * params.batch_size
                kl, _ = sess.run(
                    [g.objectives.sample_kl, g.train_op],
                    feed_dict={
                        g.ph_observations: data.obs[start:end],
                        g.ph_actions: data.act[start:end],
                        g.ph_old_actions_probs: data.old_prob[start:end],
                        g.ph_advantages: data.adv[start:end],
                        g.ph_returns: data.ret[start:end],
                    })
                del kl

            time_per_epoch = time.time() - epoch_starts
            log.debug(
                'Feed dict gpu %r: took %.3f seconds per epoch (%d batches, %d samples)',
                use_gpu, time_per_epoch, num_batches, len(data.obs),
            )

    tf.reset_default_graph()
    gc.collect()
def main():
    args, params = parse_args_tmax(AgentTMAX.Params)
    env_id = args.env

    global key_to_action
    if 'dmlab' in env_id:
        from utils.envs.dmlab import play_dmlab
        key_to_action = play_dmlab.key_to_action
    elif 'atari' in env_id:
        key_to_action = atari_utils.key_to_action
    elif 'doom' in env_id:
        key_to_action = doom_utils.key_to_action
    else:
        raise Exception('Unknown env')

    try:
        show_map = args.show_automap
    except AttributeError:
        show_map = False

    # start keypress listener (to pause/resume execution or exit)
    def start_listener():
        with Listener(on_press=on_press, on_release=on_release) as listener:
            listener.join()

    listener_thread = Thread(target=start_listener)
    listener_thread.start()

    status = enjoy(params, args.env, show_automap=show_map)

    log.debug('Press ESC to exit...')
    listener_thread.join()

    return status
def set_gpus_for_process(process_idx, num_gpus_per_process, process_type, gpu_mask=None):
    available_gpus = get_available_gpus()
    if gpu_mask is not None:
        # the mask cannot address more GPUs than are actually available
        assert len(available_gpus) >= len(gpu_mask)
        available_gpus = [available_gpus[g] for g in gpu_mask]
    num_gpus = len(available_gpus)

    gpus_to_use = []
    if num_gpus == 0:
        os.environ[CUDA_ENVVAR] = ''
        log.debug('Not using GPUs for %s process %d', process_type, process_idx)
    else:
        first_gpu_idx = process_idx * num_gpus_per_process
        for i in range(num_gpus_per_process):
            index_mod_num_gpus = (first_gpu_idx + i) % num_gpus
            gpus_to_use.append(available_gpus[index_mod_num_gpus])

        os.environ[CUDA_ENVVAR] = ','.join([str(g) for g in gpus_to_use])
        log.info(
            'Set environment var %s to %r for %s process %d',
            CUDA_ENVVAR, os.environ[CUDA_ENVVAR], process_type, process_idx,
        )
        log.debug('Visible devices: %r', torch.cuda.device_count())

    return gpus_to_use
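# A hedged usage sketch for set_gpus_for_process(): with 4 available GPUs and one GPU
# per process, workers are assigned round-robin, e.g. process 5 gets GPU 1 (5 % 4). The
# hypothetical helper below just reproduces the index arithmetic for illustration.
#
# def round_robin_gpus(process_idx, num_gpus_per_process, num_gpus):
#     first = process_idx * num_gpus_per_process
#     return [(first + i) % num_gpus for i in range(num_gpus_per_process)]
#
# assert round_robin_gpus(process_idx=5, num_gpus_per_process=1, num_gpus=4) == [1]
# assert round_robin_gpus(process_idx=1, num_gpus_per_process=2, num_gpus=4) == [2, 3]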
def add_trajectory_to_dense_map(self, existing_map, traj):
    t = Timing()

    m = existing_map
    m.new_episode()  # just in case

    node_idx = [-1] * len(traj)  # index map from trajectory frame to graph node idx
    node_idx[0] = 0  # first observation is always the same (we start from the same initial state)

    with t.timeit('create_initial_map'):
        self._add_simple_path_to_map(m, traj, node_idx)

    # precalculate feature vectors for the distance network
    with t.timeit('cache_feature_vectors'):
        all_observations = [m.get_observation(node) for node in m.graph.nodes]
        obs_embeddings = self._calc_embeddings(all_observations)

    # TODO: so far no shortcuts
    # with t.add_time('pairwise_distances'):
    #     pairwise_distances = self._calc_pairwise_distances(obs_embeddings)
    # with t.timeit('loop_closures'):
    #     self._add_shortcuts(m, pairwise_distances)

    log.debug('Add trajectory to map, timing: %s', t)
    return m
def main():
    experiments_dir = '/home/alex/all/projects/sample-factory/train_dir'
    all_experiment_dirs_list = [join(experiments_dir, v['dir']) for k, v in EXPERIMENTS.items()]

    for experiment_dir in all_experiment_dirs_list:
        log.debug('Experiment dir: %s', experiment_dir)
    log.debug('Total: %d', len(all_experiment_dirs_list))

    for env, details in EXPERIMENTS.items():
        env_dir = join(experiments_dir, details['dir'])
        event_files = list(Path(env_dir).rglob('*.tfevents.*'))
        log.info('Event files: %r', event_files)

        env_dirs = set()
        for event_file in event_files:
            env_dirs.add(os.path.dirname(event_file))

        EXPERIMENTS[env]['dirs'] = sorted(list(env_dirs))
        log.info('Env dirs for env %s: %r', env, env_dirs)

    EXPERIMENT_GROUPS = (('dmlab30',),)

    for group_i, exp_group in enumerate(EXPERIMENT_GROUPS):
        fig, ax = plt.subplots(1, 1)
        ax = [ax]

        count = 0
        for env in exp_group:
            experiments = EXPERIMENTS[env]['dirs']
            aggregate(env, experiments, count, ax[count])
            count += 1

        # handles, labels = ax[-1].get_legend_handles_labels()
        # lgd = fig.legend(handles, labels, bbox_to_anchor=(0.1, 0.88, 0.8, 0.2), loc='lower left', ncol=4, mode="expand", prop={'size': 6})
        # lgd.set_in_layout(True)
        # plt.show()
        # plot_name = f'{env}_{key.replace("/", " ")}'
        # plt.subplots_adjust(wspace=0.12, hspace=0.15)

        plt.tight_layout(rect=(0, 0, 1.0, 0.9))
        plt.margins(0, 0)
        plot_name = 'dmlab30'
        plt.savefig(
            os.path.join(os.getcwd(), f'../final_plots/reward_{plot_name}.pdf'),
            format='pdf',
            bbox_inches='tight',
            pad_inches=0,
        )
        # plt.savefig(os.path.join(os.getcwd(), f'../final_plots/reward_{plot_name}.pdf'), format='pdf', bbox_extra_artists=(lgd,))

    return 0
def _broadcast_model_weights(self):
    state_dict = self.dqn.main.state_dict()
    policy_version = self.train_step
    log.debug('Broadcast model weights for model version %d', policy_version)
    model_state = (policy_version, state_dict)
    for q in self.policy_worker_queues:
        q.put((TaskType.INIT_MODEL, model_state))
def all_accumulators_have_this_key(key):
    for scalar_accumulator in scalar_accumulators:
        if key not in scalar_accumulator.Keys():
            log.debug('Not all of the accumulators have key %s', key)
            return False

    return True
def predict(self, imagined_action_lists):
    start = time.time()

    assert len(imagined_action_lists) == self.num_envs
    imagined_action_lists = np.split(np.array(imagined_action_lists), self.num_workers)
    for worker, imagined_action_list in zip(self.workers, imagined_action_lists):
        worker.task_queue.put((imagined_action_list, MsgType.STEP_IMAGINED))

    observations = []
    rewards = []
    dones = []
    for worker in self.workers:
        worker.task_queue.join()
        results_per_worker = safe_get(
            worker.result_queue,
            timeout=1.0,
            msg='Took a surprisingly long time to predict the future, retrying...',
        )

        assert len(results_per_worker) == len(imagined_action_lists[0])
        for result in results_per_worker:
            o, r, d, _ = zip(*result)
            observations.append(o)
            rewards.append(r)
            dones.append(d)
        worker.result_queue.task_done()

    if self._verbose:
        log.debug('Prediction step took %.4f s', time.time() - start)

    return observations, rewards, dones
def main():
    args, params = parse_args_tmax(AgentTMAX.Params)
    env_id = args.env

    global key_to_action
    if 'dmlab' in env_id:
        from utils.envs.dmlab import play_dmlab
        key_to_action = play_dmlab.key_to_action
    elif 'atari' in env_id:
        key_to_action = atari_utils.key_to_action
    elif 'doom' in env_id:
        key_to_action = doom_utils.key_to_action
    else:
        raise Exception('Unknown env')

    # start keypress listener (to pause/resume execution or exit)
    def start_listener():
        with Listener(on_press=on_press, on_release=on_release) as listener:
            listener.join()

    listener_thread = Thread(target=start_listener)
    listener_thread.start()

    status = build_graph(params, args.env)

    if not terminate:
        log.debug('Press ESC to exit...')
    listener_thread.join()

    return status
def forward_pass(device_type):
    env_name = 'atari_breakout'
    cfg = default_cfg(algo='appooc', env=env_name)
    cfg.actor_critic_share_weights = True
    cfg.hidden_size = 128
    cfg.use_rnn = True
    cfg.env_framestack = 4

    env = create_env(env_name, cfg=cfg)

    torch.set_num_threads(1)
    torch.backends.cudnn.benchmark = True

    actor_critic = create_actor_critic(cfg, env.observation_space, env.action_space)
    device = torch.device(device_type)
    actor_critic.to(device)

    timing = Timing()
    with timing.timeit('all'):
        batch = 128
        with timing.add_time('input'):
            # better avoid hardcoding the observation shape here...
            observations = dict(obs=torch.rand([batch, 4, 84, 84]).to(device))
            rnn_states = torch.rand([batch, get_hidden_size(cfg)]).to(device)

        n = 200
        for i in range(n):
            with timing.add_time('forward'):
                output = actor_critic(observations, rnn_states)

            log.debug('Progress %d/%d', i, n)

    log.debug('Timing: %s', timing)
def _save_cfg(self, policy_id):
    policy_cfg_filename = policy_cfg_file(self.cfg, policy_id)
    with open(policy_cfg_filename, 'w') as json_file:
        log.debug('Saving policy-specific configuration %d to file %s', policy_id, policy_cfg_filename)
        json.dump(self.policy_cfg[policy_id], json_file)
def finish_initialization(self):
    """Wait until policy workers are fully initialized."""
    for policy_id, workers in self.policy_workers.items():
        for w in workers:
            log.debug('Waiting for policy worker %d-%d to finish initialization...', policy_id, w.worker_idx)
            w.init()
            log.debug('Policy worker %d-%d initialized!', policy_id, w.worker_idx)
def __init__(self, cfg, obs_space, timing):
    super().__init__(cfg, timing)

    self.basic_encoder = create_standard_encoder(cfg, obs_space, timing)
    self.encoder_out_size = self.basic_encoder.encoder_out_size

    # same as the IMPALA paper
    self.embedding_size = 20
    self.instructions_lstm_units = 64
    self.instructions_lstm_layers = 1

    padding_idx = 0
    self.word_embedding = nn.Embedding(
        num_embeddings=DMLAB_VOCABULARY_SIZE,
        embedding_dim=self.embedding_size,
        padding_idx=padding_idx,
    )

    self.instructions_lstm = nn.LSTM(
        input_size=self.embedding_size,
        hidden_size=self.instructions_lstm_units,
        num_layers=self.instructions_lstm_layers,
        batch_first=True,
    )

    # learnable initial state?
    # initial_hidden_values = torch.normal(0, 1, size=(self.instructions_lstm_units, ))
    # self.lstm_h0 = nn.Parameter(initial_hidden_values, requires_grad=True)
    # self.lstm_c0 = nn.Parameter(initial_hidden_values, requires_grad=True)

    self.encoder_out_size += self.instructions_lstm_units
    log.debug('Policy head output size: %r', self.encoder_out_size)

    self.cpu_device = torch.device('cpu')
def predict(self, imagined_action_lists):
    start = time.time()

    assert len(imagined_action_lists) == self.num_envs
    imagined_action_lists = np.split(np.array(imagined_action_lists), self.num_workers)
    for worker, imagined_action_list in zip(self.workers, imagined_action_lists):
        worker.step_queue.put((imagined_action_list, StepType.IMAGINED))

    observations = []
    rewards = []
    dones = []
    for worker in self.workers:
        worker.step_queue.join()
        results_per_worker = worker.result_queue.get()
        # np.split() guarantees equal chunks, so comparing against the last chunk is safe here
        assert len(results_per_worker) == len(imagined_action_list)
        for result in results_per_worker:
            o, r, d, _ = zip(*result)
            observations.append(o)
            rewards.append(r)
            dones.append(d)

    if self._verbose:
        log.debug('Prediction step took %.4f s', time.time() - start)

    return observations, rewards, dones
def _perturb_param(self, param, param_name, default_param):
    # toss a coin to decide whether we perturb the parameter at all
    if random.random() > self.cfg.pbt_mutation_rate:
        return param

    if param != default_param and random.random() < 0.05:
        # small chance to replace the parameter with its default value
        log.debug('%s changed to default value %r', param_name, default_param)
        return default_param

    if param_name in SPECIAL_PERTURBATION:
        new_value = SPECIAL_PERTURBATION[param_name](param, self.cfg)
    elif type(param) is bool:
        new_value = not param
    elif isinstance(param, numbers.Number):
        perturb_amount = random.uniform(1.01, 1.5)
        new_value = perturb_float(float(param), perturb_amount=perturb_amount)
    else:
        raise RuntimeError('Unsupported parameter type')

    log.debug('Param %s changed from %.6f to %.6f', param_name, param, new_value)
    return new_value
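# perturb_float() is referenced above but not shown in this section. A plausible minimal
# sketch (an assumption, not necessarily the actual implementation): multiply or divide
# by the sampled factor with equal probability, so mutations move in both directions.
#
# import random
#
# def perturb_float(x, perturb_amount=1.2):
#     # random direction: grow or shrink the value by the same relative amount
#     if random.random() < 0.5:
#         return x / perturb_amount
#     return x * perturb_amount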
def find_available_port(start_port, increment=1000):
    port = start_port
    while port < 65535 and not is_udp_port_available(port):
        port += increment

    log.debug('Port %r is available', port)
    return port
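# is_udp_port_available() is assumed by find_available_port() above; a minimal sketch of
# how such a check might work (try to bind a UDP socket; success means the port is free):
#
# import socket
#
# def is_udp_port_available(port):
#     try:
#         with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
#             s.bind(('', port))
#         return True
#     except OSError:
#         return False
#
# # example usage: start at 20000 and step by 1000 until an unused port is found
# # port = find_available_port(20000, increment=1000)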
def _learner_update_cfg(self, policy_id):
    learner_worker = self.learner_workers[policy_id]

    log.debug('Sending learning configuration to learner %d...', policy_id)
    cfg_task = (PbtTask.UPDATE_CFG, (policy_id, self.policy_cfg[policy_id]))
    learner_worker.task_queue.put((TaskType.PBT, cfg_task))
def step(self, actions):
    if self.skip_frames > 1 or self.num_agents == 1:
        # not used in multi-agent mode due to VizDoom limitations
        # this means that we have only one agent (+ maybe some bots, which is why we're in multiplayer mode)
        return super().step(actions)

    self._ensure_initialized()

    actions_binary = self._convert_actions(actions)

    self.game.set_action(actions_binary)
    self.game.advance_action(1, self.update_state)
    self.timestep += 1

    if not self.update_state:
        return None, None, None, None

    state = self.game.get_state()
    reward = self.game.get_last_reward()
    done = self.game.is_episode_finished()

    if self.record_to is not None:
        # send 'stop recording' command 1 tick before the end of the episode
        # otherwise it does not get saved to disk
        if self.game.get_episode_time() + 1 == self.game.get_episode_timeout():
            log.debug('Calling stop recording command!')
            self.game.send_game_command('stop')

    observation, done, info = self._process_game_step(state, done, {})
    return observation, reward, done, info
def __init__(self, cfg, obs_space, timing):
    super().__init__(cfg, timing)

    obs_shape = get_obs_shape(obs_space)
    input_ch = obs_shape.obs[0]
    log.debug('Num input channels: %d', input_ch)

    if cfg.encoder_subtype == 'convnet_simple':
        conv_filters = [[input_ch, 32, 8, 4], [32, 64, 4, 2], [64, 128, 3, 2]]
    elif cfg.encoder_subtype == 'convnet_impala':
        conv_filters = [[input_ch, 16, 8, 4], [16, 32, 4, 2]]
    elif cfg.encoder_subtype == 'minigrid_convnet_tiny':
        conv_filters = [[3, 16, 3, 1], [16, 32, 2, 1], [32, 64, 2, 1]]
    else:
        raise NotImplementedError(f'Unknown encoder {cfg.encoder_subtype}')

    activation = nonlinearity(self.cfg)
    fc_layer_size = fc_after_encoder_size(self.cfg)
    encoder_extra_fc_layers = self.cfg.encoder_extra_fc_layers

    enc = self.ConvEncoderImpl(activation, conv_filters, fc_layer_size, encoder_extra_fc_layers, obs_shape)
    self.enc = torch.jit.script(enc)

    self.encoder_out_size = calc_num_elements(self.enc, obs_shape.obs)
    log.debug('Encoder output size: %r', self.encoder_out_size)
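# calc_num_elements() is used above to size the layers that follow the convolutional
# stack. A common way to implement such a helper (a sketch, assuming the module accepts
# a single observation tensor) is to push a dummy batch through the network and count
# the elements of the output:
#
# import torch
#
# def calc_num_elements_sketch(module, module_input_shape):
#     shape_with_batch = (1,) + tuple(module_input_shape)
#     with torch.no_grad():
#         out = module(torch.rand(shape_with_batch))
#     return out.numel()  # elements in the flattened output for batch size 1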
def make_dmlab_env_impl(spec, cfg, env_config, **kwargs):
    skip_frames = cfg.env_frameskip

    gpu_idx = 0
    if len(cfg.dmlab_gpus) > 0:
        if kwargs.get('env_config') is not None:
            vector_index = kwargs['env_config']['vector_index']
            gpu_idx = cfg.dmlab_gpus[vector_index % len(cfg.dmlab_gpus)]
            log.debug('Using GPU %d for DMLab rendering!', gpu_idx)

    task_id = get_task_id(env_config, spec, cfg)
    level = task_id_to_level(task_id, spec)
    log.debug('%r level %s task id %d', env_config, level, task_id)

    env = DmlabGymEnv(
        task_id,
        level,
        skip_frames,
        cfg.res_w,
        cfg.res_h,
        cfg.dmlab_throughput_benchmark,
        cfg.dmlab_renderer,
        get_dataset_path(cfg),
        cfg.dmlab_with_instructions,
        cfg.dmlab_extended_action_set,
        cfg.dmlab_use_level_cache,
        cfg.dmlab_level_cache_path,
        gpu_idx,
        spec.extra_cfg,
    )

    if env_config and 'env_id' in env_config:
        env.seed(env_config['env_id'])

    if 'record_to' in cfg and cfg.record_to is not None:
        env = RecordingWrapper(env, cfg.record_to, 0)

    if cfg.pixel_format == 'CHW':
        env = PixelFormatChwWrapper(env)

    env = DmlabRewardShapingWrapper(env)
    return env