def forward_pass(device_type):
    env_name = 'atari_breakout'
    cfg = default_cfg(algo='appooc', env=env_name)
    cfg.actor_critic_share_weights = True
    cfg.hidden_size = 128
    cfg.use_rnn = True
    cfg.env_framestack = 4

    env = create_env(env_name, cfg=cfg)

    torch.set_num_threads(1)
    torch.backends.cudnn.benchmark = True

    actor_critic = create_actor_critic(cfg, env.observation_space, env.action_space)
    device = torch.device(device_type)
    actor_critic.to(device)

    timing = Timing()
    with timing.timeit('all'):
        batch = 128
        with timing.add_time('input'):
            # better avoid hardcoding here...
            observations = dict(obs=torch.rand([batch, 4, 84, 84]).to(device))
            rnn_states = torch.rand([batch, get_hidden_size(cfg)]).to(device)

        n = 200
        for i in range(n):
            with timing.add_time('forward'):
                output = actor_critic(observations, rnn_states)

            log.debug('Progress %d/%d', i, n)

    log.debug('Timing: %s', timing)
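# All benchmarks in this file lean on the same small Timing helper. Below is a
# minimal sketch of the API they assume (a hypothetical reconstruction, not the
# real class, which also supports nested and averaged timers): timeit()
# overwrites the named timer, add_time() accumulates into it, and results are
# readable as attributes, e.g. timing.forward.
import time
from contextlib import contextmanager


class TimingSketch(dict):
    def __getattr__(self, key):
        return self[key]

    def __setattr__(self, key, value):
        self[key] = value

    @contextmanager
    def timeit(self, key):
        # set the timer to the duration of the 'with' block
        started = time.time()
        yield
        self[key] = time.time() - started

    @contextmanager
    def add_time(self, key):
        # accumulate the duration of the 'with' block into the timer
        started = time.time()
        yield
        self[key] = self.get(key, 0.0) + time.time() - started

    def __str__(self):
        return ', '.join(f'{k}: {v:.4f}' for k, v in self.items())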
def test_multi_env_performance(test, env_type, num_envs, num_workers):
    t = Timing()
    with t.timeit('init'):
        multi_env = MultiEnv(num_envs, num_workers, test.make_env, stats_episodes=100)
        total_num_frames, frames = 20000, 0

    with t.timeit('first_reset'):
        multi_env.reset()

    next_print = print_step = 10000
    with t.timeit('experience'):
        while frames < total_num_frames:
            _, _, done, info = multi_env.step([0] * num_envs)
            frames += num_env_steps(info)
            if frames > next_print:
                log.info('Collected %d frames of experience...', frames)
                next_print += print_step

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames in parallel, %.1f FPS', t.experience, total_num_frames, fps)
    log.debug('Timing: %s', t)

    multi_env.close()
def evaluate_locomotion():
    experiments = (
        # ('doom_textured_super_sparse', 'doom_textured_super_sparse-tmax_v035-64filt'),
        # ('doom_textured_super_sparse', 'doom_textured_super_sparse-tmax_v035-gamma-0998'),
        # ('doom_maze_no_goal', 'doom_maze_no_goal-tmax_v035_dist_expl'),
        # ('doom_maze_no_goal', 'doom_maze_no_goal-tmax_v035_no_spars'),
        ('doom_textured_super_sparse_v2', 'doom_textured_super_sparse_v2_trajectory'),
    )

    t = Timing()
    with t.timeit('evaluation'):
        results = {}
        for experiment in experiments:
            env_id, exp_name = experiment
            rate, speed = evaluate_experiment(env_id, exp_name)
            results[exp_name] = (rate, speed)

    log.info('Evaluation completed, took %s', t)

    rates, speeds = [], []
    for exp_name, r in results.items():
        rate, speed = r
        log.info('%s: success_rate: %.1f%%, avg_speed %.3f', exp_name, rate * 100, speed)
        rates.append(rate)
        speeds.append(speed)

    log.info('Average across experiments: success %.1f%%, speed: %.3f', np.mean(rates) * 100, np.mean(speeds))
    return 0
def add_trajectory_to_dense_map(self, existing_map, traj):
    t = Timing()

    m = existing_map
    m.new_episode()  # just in case

    # index map from trajectory frame to graph node idx
    node_idx = [-1] * len(traj)
    # first observation is always the same (we start from the same initial state)
    node_idx[0] = 0

    with t.timeit('create_initial_map'):
        self._add_simple_path_to_map(m, traj, node_idx)

    # precalculate feature vectors for the distance network
    with t.timeit('cache_feature_vectors'):
        all_observations = [m.get_observation(node) for node in m.graph.nodes]
        obs_embeddings = self._calc_embeddings(all_observations)

    # with t.add_time('pairwise_distances'):
    #     pairwise_distances = self._calc_pairwise_distances(obs_embeddings)

    # TODO: so far no shortcuts
    # with t.timeit('loop_closures'):
    #     self._add_shortcuts(m, pairwise_distances)

    log.debug('Add trajectory to map, timing: %s', t)
    return m
def test_gumbel_trick(self):
    """
    We use Gumbel noise for sampling, which seems to be faster than pytorch multinomial.
    Here we test that the two are actually equivalent.
    """
    timing = Timing()

    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    with torch.no_grad():
        action_space = gym.spaces.Discrete(8)
        num_logits = calc_num_logits(action_space)
        device_type = 'cpu'
        device = torch.device(device_type)
        logits = torch.rand(self.batch_size, num_logits, device=device) * 10.0 - 5.0

        if device_type == 'cuda':
            torch.cuda.synchronize(device)

        count_gumbel, count_multinomial = np.zeros([action_space.n]), np.zeros([action_space.n])

        # estimate probability mass by actually sampling both ways
        num_samples = 20000

        # warmup calls before timing
        action_distribution = get_action_distribution(action_space, logits)
        sample_actions_log_probs(action_distribution)
        action_distribution.sample_gumbel()

        with timing.add_time('gumbel'):
            for i in range(num_samples):
                action_distribution = get_action_distribution(action_space, logits)
                samples_gumbel = action_distribution.sample_gumbel()
                count_gumbel[samples_gumbel[0]] += 1

        action_distribution = get_action_distribution(action_space, logits)
        action_distribution.sample()

        with timing.add_time('multinomial'):
            for i in range(num_samples):
                action_distribution = get_action_distribution(action_space, logits)
                samples_multinomial = action_distribution.sample()
                count_multinomial[samples_multinomial[0]] += 1

        estimated_probs_gumbel = count_gumbel / float(num_samples)
        estimated_probs_multinomial = count_multinomial / float(num_samples)

        log.debug('Gumbel estimated probs: %r', estimated_probs_gumbel)
        log.debug('Multinomial estimated probs: %r', estimated_probs_multinomial)
        log.debug('Sampling timing: %s', timing)
        time.sleep(0.1)  # to finish logging
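# The equivalence being tested is the Gumbel-max trick: if U ~ Uniform(0, 1)
# elementwise, then argmax_i(logits_i - log(-log(U_i))) is distributed exactly
# as Categorical(softmax(logits)). A standalone sketch of the idea (assumed to
# match what sample_gumbel() does internally, which is not shown in this file):
import torch


def gumbel_sample(logits):
    # one categorical sample per row of logits, without computing a softmax
    uniform = torch.rand_like(logits).clamp(min=1e-20)  # avoid log(0)
    gumbel_noise = -torch.log(-torch.log(uniform))
    return torch.argmax(logits + gumbel_noise, dim=-1)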
def record_trajectory(params, env_id):
    def make_env_func():
        e = create_env(env_id, skip_frames=True)
        e.seed(0)
        return e

    env = make_env_func()
    map_img, coord_limits = generate_env_map(make_env_func)

    env_obs, info = reset_with_info(env)
    obs = main_observation(env_obs)
    done = False

    m = TopologicalMap(obs, directed_graph=False, initial_info=info, verbose=True)
    trajectory = Trajectory(env_idx=-1)

    frame = 0

    t = Timing()
    # 'terminate' and 'current_actions' are module-level globals updated by the
    # keyboard handler that drives this interactive recording loop
    while not done and not terminate:
        with t.timeit('one_frame'):
            env.render()

            if len(current_actions) > 0:
                action = current_actions[-1]
            else:
                action = 0

            trajectory.add(obs, action, info)
            m.add_landmark(obs, info, update_curr_landmark=True)

            env_obs, rew, done, info = env.step(action)
            obs = main_observation(env_obs)

        # throttle the loop to a fixed framerate for comfortable recording
        took_seconds = t.one_frame
        desired_fps = 15
        wait_seconds = (1.0 / desired_fps) - took_seconds
        wait_seconds = max(0.0, wait_seconds)
        time.sleep(wait_seconds)

        frame += 1

    env.render()
    time.sleep(0.2)

    trajectory_dir = trajectory.save(params.experiment_dir())
    m.save_checkpoint(trajectory_dir, map_img=map_img, coord_limits=coord_limits, verbose=True)

    env.close()
    return 0
def _learn_loop(self, multi_env):
    """Main training loop."""
    step, env_steps = self.session.run([self.actor_step, self.total_env_steps])

    env_obs = multi_env.reset()
    observations, goals = main_observation(env_obs), goal_observation(env_obs)

    buffer = PPOBuffer()

    def end_of_training(s, es):
        return s >= self.params.train_for_steps or es > self.params.train_for_env_steps

    while not end_of_training(step, env_steps):
        timing = Timing()
        num_steps = 0
        batch_start = time.time()

        buffer.reset()

        with timing.timeit('experience'):
            # collecting experience
            for rollout_step in range(self.params.rollout):
                actions, action_probs, values = self.actor_critic.invoke(self.session, observations, goals=goals)

                # wait for all the workers to complete an environment step
                env_obs, rewards, dones, infos = multi_env.step(actions)
                self.process_infos(infos)
                new_observations, new_goals = main_observation(env_obs), goal_observation(env_obs)

                # add experience from all environments to the current buffer
                buffer.add(observations, actions, action_probs, rewards, dones, values, goals)
                observations = new_observations
                goals = new_goals

                num_steps += num_env_steps(infos)

            # last step values are required for TD-return calculation
            _, _, values = self.actor_critic.invoke(self.session, observations, goals=goals)
            buffer.values.append(values)

        env_steps += num_steps

        # calculate discounted returns and GAE
        buffer.finalize_batch(self.params.gamma, self.params.gae_lambda)

        # update actor and critic
        with timing.timeit('train'):
            step = self._train(buffer, env_steps)

        avg_reward = multi_env.calc_avg_rewards(n=self.params.stats_episodes)
        avg_length = multi_env.calc_avg_episode_lengths(n=self.params.stats_episodes)
        fps = num_steps / (time.time() - batch_start)

        self._maybe_print(step, env_steps, avg_reward, avg_length, fps, timing)
        self._maybe_aux_summaries(env_steps, avg_reward, avg_length, fps)
        self._maybe_update_avg_reward(avg_reward, multi_env.stats_num_episodes())
        self._maybe_coverage_summaries(env_steps)
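# finalize_batch() above computes discounted returns and generalized advantage
# estimates (GAE); its body is not shown in this file, so this is a
# hypothetical sketch of the standard recurrence it presumably implements:
#   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
#   A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
import numpy as np


def gae_sketch(rewards, dones, values, gamma, gae_lambda):
    # values has one extra entry: V(s_T) appended after the last rollout step,
    # exactly like buffer.values.append(values) in the loop above
    values = np.asarray(values, dtype=np.float32)
    num_steps = len(rewards)
    advantages = np.zeros(num_steps, dtype=np.float32)
    last_adv = 0.0
    for t in reversed(range(num_steps)):
        nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * values[t + 1] * nonterminal - values[t]
        last_adv = delta + gamma * gae_lambda * nonterminal * last_adv
        advantages[t] = last_adv
    returns = advantages + values[:-1]
    return returns, advantages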
def test_dist_training(self):
    t = Timing()

    def make_env():
        return make_doom_env(doom_env_by_name(TEST_ENV_NAME))

    params = AgentTMAX.Params('__test_dist_train__')
    params.distance_target_buffer_size = 1000

    with t.timeit('generate_data'):
        # first: generate fake random data
        buffer = Buffer()

        obs1 = np.full([84, 84, 3], 0, dtype=np.uint8)
        obs1[:, :, 1] = 255
        obs2 = np.full([84, 84, 3], 0, dtype=np.uint8)
        obs2[:, :, 2] = 255

        data_size = params.distance_target_buffer_size
        for i in range(data_size):
            same = i % 2 == 0
            if same:
                if random.random() < 0.5:
                    obs_first = obs_second = obs1
                else:
                    obs_first = obs_second = obs2
            else:
                obs_first, obs_second = obs1, obs2
                if random.random() < 0.5:
                    obs_second, obs_first = obs_first, obs_second

            buffer.add(obs_first=obs_first, obs_second=obs_second, labels=0 if same else 1)

    with t.timeit('init'):
        agent = AgentTMAX(make_env, params)
        agent.initialize()

    params.distance_train_epochs = 1
    params.distance_batch_size = 256
    agent.distance.train(buffer, 1, agent)

    with t.timeit('train'):
        params.distance_train_epochs = 2
        params.distance_batch_size = 64
        agent.distance.train(buffer, 1, agent, t)

    agent.finalize()

    log.info('Timing: %s', t)
    shutil.rmtree(params.experiment_dir())
def episodic_memory_summary(self, env_steps, summary_writer, **kwargs):
    t = Timing()

    with t.timeit('ecr_memory'):
        time_since_last = time.time() - self._last_map_summary
        map_summary_rate_seconds = 120
        if time_since_last <= map_summary_rate_seconds:
            return
        if self.episodic_memories is None:
            return

        env_to_plot = 0
        for env_i, memory in enumerate(self.episodic_memories):
            if len(memory) > len(self.episodic_memories[env_to_plot]):
                env_to_plot = env_i

        log.info('Visualizing episodic memory for env %d', env_to_plot)
        memory_to_plot = self.episodic_memories[env_to_plot]
        if len(memory_to_plot) <= 0:
            return

        landmark_indices = sorted(memory_to_plot.landmarks.keys())

        m = TopologicalMap(
            memory_to_plot.landmarks[landmark_indices[0]].embedding,
            directed_graph=False,
            initial_info=memory_to_plot.landmarks[landmark_indices[0]].info,
        )

        for lm_idx in landmark_indices[1:]:
            info = memory_to_plot.landmarks[lm_idx].info
            # noinspection PyProtectedMember
            m._add_new_node(
                obs=memory_to_plot.landmarks[lm_idx].embedding,
                pos=get_position(info),
                angle=get_angle(info),
            )

        map_img = kwargs.get('map_img')
        coord_limits = kwargs.get('coord_limits')
        map_summaries([m], env_steps, summary_writer, 'ecr', map_img, coord_limits, is_sparse=True)

        self._last_map_summary = time.time()

    log.info('Took %s', t)
def test_buffer_performance(self):
    small_buffer = Buffer()
    small_buffer.add_many(obs=np.zeros([1000, 84, 84, 3], dtype=np.uint8))

    buffer = Buffer()

    t = Timing()

    with t.timeit('add'):
        for i in range(100):
            buffer.add_buff(small_buffer)

    huge_buffer = Buffer()
    with t.timeit('add_huge'):
        huge_buffer.add_buff(buffer)
        huge_buffer.add_buff(buffer)

    with t.timeit('single_add_small'):
        huge_buffer.add_buff(small_buffer)

    with t.timeit('clear_and_add'):
        huge_buffer.clear()
        huge_buffer.add_buff(buffer)
        huge_buffer.add_buff(buffer)

    with t.timeit('shuffle_and_add'):
        huge_buffer.clear()
        huge_buffer.add_buff(buffer)
        huge_buffer.add_buff(small_buffer)
        with t.timeit('shuffle'):
            huge_buffer.shuffle_data()

    log.debug('Timing: %s', t)
def calc_test_error(self, buffer, env_steps, agent, timing=None):
    log.info('Calculating distance net test error...')

    if timing is None:
        timing = Timing()

    params = agent.params
    batch_size = params.distance_batch_size
    dist_step = self.step.eval(session=agent.session)

    with timing.timeit('dist_test_error'):
        losses = []
        obs_first, obs_second, labels = buffer.obs_first, buffer.obs_second, buffer.labels

        for i in range(0, len(obs_first) - 1, batch_size):
            start, end = i, i + batch_size

            loss = agent.session.run(
                self.loss,
                feed_dict={
                    self.ph_obs_first: obs_first[start:end],
                    self.ph_obs_second: obs_second[start:end],
                    self.ph_labels: labels[start:end],
                    self.ph_is_training: False,
                },
            )
            losses.append(loss)

        avg_loss = np.mean(losses)
        log.info('Avg loss at %d steps is %.3f', dist_step, avg_loss)

        summary_obj_env_steps = tf.Summary()
        summary_obj_env_steps.value.add(tag='distance/test_loss_env_steps', simple_value=avg_loss)
        agent.summary_writer.add_summary(summary_obj_env_steps, env_steps)

        summary_obj_training_steps = tf.Summary()
        summary_obj_training_steps.value.add(tag='distance/test_loss_train_steps', simple_value=avg_loss)
        agent.summary_writer.add_summary(summary_obj_training_steps, dist_step)

        agent.summary_writer.flush()

    log.debug('Took %s', timing)
def test_quad_env(self):
    self.assertIsNotNone(create_env('quadrotor_single'))

    env = create_env('quadrotor_single')
    obs = env.reset()

    n_frames = 10000

    timing = Timing()
    with timing.timeit('step'):
        for i in range(n_frames):
            obs, r, d, info = env.step(env.action_space.sample())
            if d:
                env.reset()

    log.debug('Time %s, FPS %.1f', timing, n_frames / timing.step)
def test_quad_multi_env(self):
    env_name = 'quadrotor_multi'
    cfg = default_cfg(env=env_name)
    self.assertIsNotNone(create_env(env_name, cfg=cfg))

    env = create_env(env_name, cfg=cfg)
    env.reset()

    n_frames = 1000

    timing = Timing()
    with timing.timeit('step'):
        for i in range(n_frames):
            obs, r, d, info = env.step([env.action_space.sample() for _ in range(env.num_agents)])

    log.debug('Time %s, FPS %.1f', timing, n_frames / timing.step)
def test_env_performance(make_env, env_type, verbose=False):
    t = Timing()
    with t.timeit('init'):
        env = make_env(AttrDict({'worker_index': 0, 'vector_index': 0}))
        total_num_frames, frames = 10000, 0

    with t.timeit('first_reset'):
        env.reset()

    t.reset = t.step = 1e-9
    num_resets = 0
    with t.timeit('experience'):
        while frames < total_num_frames:
            done = False

            start_reset = time.time()
            env.reset()
            t.reset += time.time() - start_reset
            num_resets += 1

            while not done and frames < total_num_frames:
                start_step = time.time()
                if verbose:
                    env.render()
                    time.sleep(1.0 / 40)

                obs, rew, done, info = env.step(env.action_space.sample())
                if verbose:
                    log.info('Received reward %.3f', rew)

                t.step += time.time() - start_step
                frames += num_env_steps([info])

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames on one CPU, %.1f FPS', t.experience, total_num_frames, fps)
    log.debug('Avg. reset time %.3f s', t.reset / num_resets)
    log.debug('Timing: %s', t)

    env.close()
def _train_loop(self):
    timing = Timing()
    self.initialize(timing)

    wait_times = deque([], maxlen=self.cfg.num_workers)
    last_cache_cleanup = time.time()
    num_batches_processed = 0

    while not self.terminate:
        with timing.timeit('train_wait'):
            data = safe_get(self.experience_buffer_queue)

        if self.terminate:
            break

        wait_stats = None
        wait_times.append(timing.train_wait)

        if len(wait_times) >= wait_times.maxlen:
            wait_times_arr = np.asarray(wait_times)
            wait_avg = np.mean(wait_times_arr)
            wait_min, wait_max = wait_times_arr.min(), wait_times_arr.max()
            # log.debug(
            #     'Training thread had to wait %.5f s for the new experience buffer (avg %.5f)',
            #     timing.train_wait, wait_avg,
            # )
            wait_stats = (wait_avg, wait_min, wait_max)

        self._process_training_data(data, timing, wait_stats)
        num_batches_processed += 1

        if time.time() - last_cache_cleanup > 300.0 or (not self.cfg.benchmark and num_batches_processed < 50):
            if self.cfg.device == 'gpu':
                torch.cuda.empty_cache()
                torch.cuda.ipc_collect()
            last_cache_cleanup = time.time()

    time.sleep(0.3)
    log.info('Train loop timing: %s', timing)
    del self.actor_critic
    del self.device
def run_multi_quadrotor_env(env_name, cfg):
    env = create_env(env_name, cfg=cfg)
    env.reset()

    # warmup steps, not included in the timing below
    for i in range(100):
        obs, r, d, info = env.step([env.action_space.sample() for _ in range(env.num_agents)])

    n_frames = 1000
    env = create_env(env_name, cfg=cfg)
    env.reset()

    timing = Timing()
    with timing.timeit('step'):
        for i in range(n_frames):
            obs, r, d, info = env.step([env.action_space.sample() for _ in range(env.num_agents)])

    log.debug('Time %s, FPS %.1f', timing, n_frames * env.num_agents / timing.step)
    env.close()
def create_dqn(cfg, obs_space, action_space, timing=None):
    if timing is None:
        timing = Timing()

    def make_encoder():
        return create_encoder(cfg, obs_space, timing)

    def make_core(encoder):
        return create_core(cfg, encoder.get_encoder_out_size())

    main = _SimpleDQN(make_encoder, make_core, action_space, cfg, timing)
    target = _SimpleDQN(make_encoder, make_core, action_space, cfg, timing)
    return _DQN(main, target)
def create_actor_critic(cfg, obs_space, action_space, timing=None):
    if timing is None:
        timing = Timing()

    def make_encoder():
        return create_encoder(cfg, obs_space, timing)

    def make_core(encoder):
        return create_core(cfg, encoder.get_encoder_out_size())

    if cfg.actor_critic_share_weights:
        return _ActorCriticSharedWeights(make_encoder, make_core, action_space, cfg, timing)
    else:
        return _ActorCriticSeparateWeights(make_encoder, make_core, action_space, cfg, timing)
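# The two factories above take encoder/core builder functions rather than
# ready-made modules, presumably so each network variant can instantiate as
# many copies as it needs (e.g. separate actor and critic towers, or the main
# and target nets in create_dqn). A minimal usage sketch, reusing the
# default_cfg/create_env helpers already used in forward_pass at the top of
# this file ('atari_breakout' is just an example env name):
def make_shared_actor_critic(env_name='atari_breakout', device_type='cpu'):
    cfg = default_cfg(env=env_name)
    cfg.actor_critic_share_weights = True  # one encoder/core, two heads
    env = create_env(env_name, cfg=cfg)
    model = create_actor_critic(cfg, env.observation_space, env.action_space)
    model.to(torch.device(device_type))
    return model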
def __init__(self, args):
    """
    This initialises our main class; it expects the parsed arguments to be passed in.
    """
    # Initialise the logger used throughout the whole script
    self.logger = init_logging()

    # String to print in case a command finishes successfully
    self.success_symbol = u"\u2713"
    # String to print in case a command fails
    self.failure_symbol = u"\u2717"

    # The passed arguments
    self.import_file = args.import_file if "import_file" in args else ""
    self.dry_run = args.dry_run if "dry_run" in args else False
    self.compare = args.compare if "compare" in args else False
    self.list_add = args.list_add if "list_add" in args else None

    # The mode to operate in
    self.mode = MODE_BATCH if "import_file" in args else MODE_INTERACTIVE

    # The Marionette instance, wrapped by our own helper class
    self.marionette = MarionetteHelper(self.logger, self.success_symbol, self.failure_symbol)

    # A set of existing bookmarks to check later if a bookmark was already saved
    self.bookmarks = set()

    # Initialise the Google Maps API
    try:
        path = os.path.dirname(os.path.realpath(__file__))
        key_file = "gm-api-key.json"
        with open("{}/{}".format(path, key_file), "r") as f:
            data = json.load(f)
        self.gm = googlemaps.Client(key=data["key"])
    except IOError:
        self.logger.error(u" > [ERROR] Unable to open '{}', Google Maps API disabled {}".format(
            key_file, self.failure_symbol))
        self.gm = None

    # Initialise timing
    self.timing = Timing(self.logger)
def test_env_performance(test, env_type):
    t = Timing()
    with t.timeit('init'):
        env = test.make_env()
        total_num_frames, frames = 4000, 0
        agent = AgentRandom(test.make_env, {})

    with t.timeit('first_reset'):
        env.reset()

    t.reset = t.step = 1e-9
    num_resets = 0
    with t.timeit('experience'):
        while frames < total_num_frames:
            done = False

            start_reset = time.time()
            env.reset()
            t.reset += time.time() - start_reset
            num_resets += 1

            while not done and frames < total_num_frames:
                start_step = time.time()
                obs, rew, done, info = env.step(agent.best_action())
                t.step += time.time() - start_step
                frames += num_env_steps([info])

    fps = total_num_frames / t.experience
    log.debug('%s performance:', env_type)
    log.debug('Took %.3f sec to collect %d frames on one CPU, %.1f FPS', t.experience, total_num_frames, fps)
    log.debug('Avg. reset time %.3f s', t.reset / num_resets)
    log.debug('Timing: %s', t)

    env.close()
def _run(self):
    """
    Main loop of the actor worker (rollout worker).
    Process tasks (mainly ROLLOUT_STEP) until we get the termination signal, which usually means end of training.
    Currently there is no mechanism to restart dead workers if something bad happens during training. We can only
    retry on the initial reset(). This is definitely something to work on.
    """
    log.info('Initializing vector env runner %d...', self.worker_idx)

    # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    torch.multiprocessing.set_sharing_strategy('file_system')

    timing = Timing()

    last_report = time.time()
    with torch.no_grad():
        while not self.terminate:
            try:
                try:
                    with timing.add_time('waiting'), timing.timeit('wait_actor'):
                        tasks = self.task_queue.get_many(timeout=0.1)
                except Empty:
                    tasks = []

                for task in tasks:
                    task_type, data = task

                    if task_type == TaskType.INIT:
                        self._init()
                        continue

                    if task_type == TaskType.TERMINATE:
                        self._terminate()
                        break

                    # handling actual workload
                    if task_type == TaskType.ROLLOUT_STEP:
                        if 'work' not in timing:
                            timing.waiting = 0  # measure waiting only after real work has started

                        with timing.add_time('work'), timing.timeit('one_step'):
                            self._advance_rollouts(data, timing)
                    elif task_type == TaskType.RESET:
                        with timing.add_time('reset'):
                            self._handle_reset()
                    elif task_type == TaskType.PBT:
                        self._process_pbt_task(data)

                if time.time() - last_report > 5.0 and 'one_step' in timing:
                    timing_stats = dict(wait_actor=timing.wait_actor, step_actor=timing.one_step)
                    memory_mb = memory_consumption_mb()
                    stats = dict(memory_actor=memory_mb)
                    self.report_queue.put(dict(timing=timing_stats, stats=stats))
                    last_report = time.time()

            except RuntimeError as exc:
                log.warning('Error while processing data w: %d, exception: %s', self.worker_idx, exc)
                log.warning('Terminate process...')
                self.terminate = True
                self.report_queue.put(dict(critical_error=self.worker_idx))
            except KeyboardInterrupt:
                self.terminate = True
            except Exception:
                log.exception('Unknown exception in rollout worker')
                self.terminate = True

    if self.worker_idx <= 1:
        time.sleep(0.1)

    log.info(
        'Env runner %d, CPU aff. %r, rollouts %d: timing %s',
        self.worker_idx, psutil.Process().cpu_affinity(), self.num_complete_rollouts, timing,
    )
def run(self):
    """
    This function contains the main loop of the algorithm, as well as initialization/cleanup code.

    :return: ExperimentStatus (SUCCESS, FAILURE, INTERRUPTED). Useful in testing.
    """
    status = ExperimentStatus.SUCCESS

    if os.path.isfile(done_filename(self.cfg)):
        log.warning('Training already finished! Remove "done" file to continue training')
        return status

    self.init_workers()
    self.init_pbt()
    self.finish_initialization()

    log.info('Collecting experience...')

    timing = Timing()
    with timing.timeit('experience'):
        # noinspection PyBroadException
        try:
            while not self._should_end_training():
                try:
                    reports = self.report_queue.get_many(timeout=0.1)
                    for report in reports:
                        self.process_report(report)
                except Empty:
                    pass

                if time.time() - self.last_report > self.report_interval:
                    self.report()

                    now = time.time()
                    self.total_train_seconds += now - self.last_report
                    self.last_report = now

                self.pbt.update(self.env_steps, self.policy_avg_stats)

        except Exception:
            log.exception('Exception in driver loop')
            status = ExperimentStatus.FAILURE
        except KeyboardInterrupt:
            log.warning('Keyboard interrupt detected in driver loop, exiting...')
            status = ExperimentStatus.INTERRUPTED

    for learner in self.learner_workers.values():
        # timeout is needed here because some environments may crash on KeyboardInterrupt (e.g. VizDoom).
        # In that case the learner train loop will never do another iteration and will never save the model.
        # This is not an issue with normal exit, e.g. due to desired number of frames reached.
        learner.save_model(timeout=5.0)

    all_workers = self.actor_workers
    for workers in self.policy_workers.values():
        all_workers.extend(workers)
    all_workers.extend(self.learner_workers.values())

    child_processes = list_child_processes()

    time.sleep(0.1)
    log.debug('Closing workers...')
    for i, w in enumerate(all_workers):
        w.close()
        time.sleep(0.01)
    for i, w in enumerate(all_workers):
        w.join()
    log.debug('Workers joined!')

    # VizDoom processes often refuse to die for an unidentified reason, so we're force killing them with a hack
    kill_processes(child_processes)

    fps = self.total_env_steps_since_resume / timing.experience
    log.info('Collected %r, FPS: %.1f', self.env_steps, fps)
    log.info('Timing: %s', timing)

    if self._should_end_training():
        with open(done_filename(self.cfg), 'w') as fobj:
            fobj.write(f'{self.env_steps}')

    time.sleep(0.5)
    log.info('Done!')

    return status
def evaluate_locomotion_agent(agent, multi_env):
    num_envs = multi_env.num_envs

    observations = main_observation(multi_env.reset())
    obs_prev = observations
    infos = multi_env.info()

    agent.tmax_mgr.initialize(observations, infos, 1)
    m = agent.tmax_mgr.dense_persistent_maps[-1]

    navigator = Navigator(agent)
    for env_i in range(num_envs):
        navigator.reset(env_i, m)

    # sample final goals
    all_targets = list(m.graph.nodes)
    if len(all_targets) > 0:
        all_targets.remove(0)
    final_goal_idx = random.sample(all_targets, num_envs)
    log.info('Goals: %r', final_goal_idx)

    # noinspection PyProtectedMember
    navigator._ensure_paths_to_goal_calculated([m] * num_envs, final_goal_idx)

    # paths are stored as successor maps, so walk them to get path lengths
    path_lengths = [0] * num_envs
    for env_i in range(num_envs):
        location, path_length = 0, 0
        while location != final_goal_idx[env_i]:
            location = navigator.paths[env_i][location]
            path_length += 1
        path_lengths[env_i] = path_length

    frames = 0
    next_target, next_target_d = navigator.get_next_target(
        [m] * num_envs, observations, final_goal_idx, [frames] * num_envs,
    )
    next_target_obs = [m.get_observation(t) for t in next_target]

    avg_speed = [-1] * num_envs
    success = [False] * num_envs

    t = Timing()
    while True:
        with t.timeit('frame'):
            with t.timeit('policy'):
                actions = policy_step(agent, obs_prev, observations, next_target_obs, final_goal_idx)

            with t.timeit('step'):
                env_obs, rew, done, info = multi_env.step(actions)

            obs_prev = observations
            observations = main_observation(env_obs)

            with t.timeit('navigator'):
                next_target, next_target_d = navigator.get_next_target(
                    [m] * num_envs, observations, final_goal_idx, [frames] * num_envs,
                )

            for env_i in range(num_envs):
                if final_goal_idx[env_i] is None:
                    continue

                if next_target[env_i] is None:
                    log.warning(
                        'Agent %d got lost in %d steps trying to reach %d',
                        env_i, frames, final_goal_idx[env_i],
                    )
                    final_goal_idx[env_i] = None
                else:
                    if next_target[env_i] == final_goal_idx[env_i] and next_target_d[env_i] < 0.1:
                        success[env_i] = True
                        avg_speed[env_i] = path_lengths[env_i] / (frames + 1)
                        log.debug(
                            'Agent %d reached goal %d in %d steps, avg. speed %.3f',
                            env_i, final_goal_idx[env_i], frames, avg_speed[env_i],
                        )
                        final_goal_idx[env_i] = None

                    next_target_obs[env_i] = m.get_observation(next_target[env_i])

        frames += 1
        if frames > 5000:
            log.error('Timeout! 5000 frames was not enough to finish locomotion!')
            break

        finished = [g is None for g in final_goal_idx]
        if all(finished):
            log.info('Done!')
            break
        elif frames % 10 == 0:
            frame_repeat = 4
            fps = (1.0 / t.frame) * frame_repeat * num_envs
            log.info('%d agents remaining, fps %.3f, time %s', num_envs - sum(finished), fps, t)

    return success, avg_speed
def sample(self, proc_idx):
    # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    timing = Timing()

    from threadpoolctl import threadpool_limits
    with threadpool_limits(limits=1, user_api=None):
        if self.cfg.set_workers_cpu_affinity:
            set_process_cpu_affinity(proc_idx, self.cfg.num_workers)

        initial_cpu_affinity = psutil.Process().cpu_affinity() if platform != 'darwin' else None
        psutil.Process().nice(10)

        with timing.timeit('env_init'):
            envs = []
            env_key = ['env' for _ in range(self.cfg.num_envs_per_worker)]

            for env_idx in range(self.cfg.num_envs_per_worker):
                global_env_id = proc_idx * self.cfg.num_envs_per_worker + env_idx
                env_config = AttrDict(worker_index=proc_idx, vector_index=env_idx, env_id=global_env_id)

                env = create_env(self.cfg.env, cfg=self.cfg, env_config=env_config)
                log.debug(
                    'CPU affinity after create_env: %r',
                    psutil.Process().cpu_affinity() if platform != 'darwin' else 'MacOS - None',
                )
                env.seed(global_env_id)
                envs.append(env)

                # this is to track the performance for individual DMLab levels
                if hasattr(env.unwrapped, 'level_name'):
                    env_key[env_idx] = env.unwrapped.level_name

            episode_length = [0 for _ in envs]
            episode_lengths = [deque([], maxlen=20) for _ in envs]

        try:
            with timing.timeit('first_reset'):
                for env_idx, env in enumerate(envs):
                    env.reset()
                    log.info('Process %d finished resetting %d/%d envs', proc_idx, env_idx + 1, len(envs))

                self.report_queue.put(dict(proc_idx=proc_idx, finished_reset=True))

            self.start_event.wait()

            with timing.timeit('work'):
                last_report = last_report_frames = total_env_frames = 0
                while not self.terminate.value and total_env_frames < self.cfg.sample_env_frames_per_worker:
                    for env_idx, env in enumerate(envs):
                        action = env.action_space.sample()
                        with timing.add_time(f'{env_key[env_idx]}.step'):
                            obs, reward, done, info = env.step(action)

                        num_frames = info.get('num_frames', 1)
                        total_env_frames += num_frames
                        episode_length[env_idx] += num_frames

                        if done:
                            with timing.add_time(f'{env_key[env_idx]}.reset'):
                                env.reset()

                            episode_lengths[env_idx].append(episode_length[env_idx])
                            episode_length[env_idx] = 0

                    with timing.add_time('report'):
                        now = time.time()
                        if now - last_report > self.report_every_sec:
                            last_report = now

                            frames_since_last_report = total_env_frames - last_report_frames
                            last_report_frames = total_env_frames
                            self.report_queue.put(dict(proc_idx=proc_idx, env_frames=frames_since_last_report))

            # Extra check to make sure cpu affinity is preserved throughout the execution.
            # I observed a weird effect when some environments tried to alter the affinity of the current process,
            # leading to decreased performance.
            # This can be caused by some interactions between deep learning libs, OpenCV, MKL, OpenMP, etc.
            # At least the user should know about it if this is happening.
            cpu_affinity = psutil.Process().cpu_affinity() if platform != 'darwin' else None
            assert initial_cpu_affinity == cpu_affinity, \
                f'Worker CPU affinity was changed from {initial_cpu_affinity} to {cpu_affinity}! ' \
                f'This can significantly affect performance!'

        except Exception:
            log.exception('Unknown exception')
            log.error('Unknown exception in worker %d, terminating...', proc_idx)
            self.report_queue.put(dict(proc_idx=proc_idx, crash=True))

        time.sleep(proc_idx * 0.01 + 0.01)
        log.info('Process %d finished sampling. Timing: %s', proc_idx, timing)

        for env_idx, env in enumerate(envs):
            if len(episode_lengths[env_idx]) > 0:
                log.warning('Level %s avg episode len %d', env_key[env_idx], np.mean(episode_lengths[env_idx]))

        for env in envs:
            env.close()
def _run(self):
    # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    try:
        psutil.Process().nice(self.cfg.default_niceness)
    except psutil.AccessDenied:
        log.error('Low niceness requires sudo!')

    if self.cfg.device == 'gpu':
        cuda_envvars(self.policy_id)

    torch.multiprocessing.set_sharing_strategy('file_system')
    torch.set_num_threads(self.cfg.learner_main_loop_num_cores)

    timing = Timing()

    rollouts = []

    if self.train_in_background:
        self.training_thread.start()
    else:
        self.initialize(timing)
        log.error(
            'train_in_background set to False on learner %d! This is slow, use only for testing!',
            self.policy_id,
        )

    while not self.terminate:
        while True:
            try:
                tasks = self.task_queue.get_many(timeout=0.005)

                for task_type, data in tasks:
                    if task_type == TaskType.TRAIN:
                        with timing.add_time('extract'):
                            rollouts.extend(self._extract_rollouts(data))
                            # log.debug('Learner %d has %d rollouts', self.policy_id, len(rollouts))
                    elif task_type == TaskType.INIT:
                        self._init()
                    elif task_type == TaskType.TERMINATE:
                        time.sleep(0.3)
                        log.info('GPU learner timing: %s', timing)
                        self._terminate()
                        break
                    elif task_type == TaskType.PBT:
                        self._process_pbt_task(data)
            except Empty:
                break

        if self._accumulated_too_much_experience(rollouts):
            # if we accumulated too much experience, signal the policy workers to stop experience collection
            if not self.stop_experience_collection[self.policy_id]:
                log.debug('Learner %d accumulated too much experience, stop experience collection!', self.policy_id)
                self.stop_experience_collection[self.policy_id] = True
        elif self.stop_experience_collection[self.policy_id]:
            # otherwise, resume the experience collection if it was stopped
            self.stop_experience_collection[self.policy_id] = False
            with self.resume_experience_collection_cv:
                log.debug('Learner %d is resuming experience collection!', self.policy_id)
                self.resume_experience_collection_cv.notify_all()

        with torch.no_grad():
            rollouts = self._process_rollouts(rollouts, timing)

        if not self.train_in_background:
            while not self.experience_buffer_queue.empty():
                training_data = self.experience_buffer_queue.get()
                self._process_training_data(training_data, timing)

        self._experience_collection_rate_stats()

    if self.train_in_background:
        self.experience_buffer_queue.put(None)
        self.training_thread.join()
def _run(self):
    # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    psutil.Process().nice(min(self.cfg.default_niceness + 2, 20))

    cuda_envvars(self.policy_id)
    torch.multiprocessing.set_sharing_strategy('file_system')

    timing = Timing()

    with timing.timeit('init'):
        # initialize the Torch modules
        log.info('Initializing model on the policy worker %d-%d...', self.policy_id, self.worker_idx)

        torch.set_num_threads(1)

        if self.cfg.device == 'gpu':
            # we should already see only one CUDA device, because of env vars
            assert torch.cuda.device_count() == 1
            self.device = torch.device('cuda', index=0)
        else:
            self.device = torch.device('cpu')

        self.actor_critic = create_actor_critic(self.cfg, self.obs_space, self.action_space, timing)
        self.actor_critic.model_to_device(self.device)
        for p in self.actor_critic.parameters():
            p.requires_grad = False  # we don't train anything here

        log.info('Initialized model on the policy worker %d-%d!', self.policy_id, self.worker_idx)

    last_report = last_cache_cleanup = time.time()
    last_report_samples = 0
    request_count = deque(maxlen=50)

    # very conservative limit on the minimum number of requests to wait for
    # this will almost guarantee that the system will continue collecting experience
    # at max rate even when 2/3 of workers are stuck for some reason (e.g. doing a long env reset)
    # Although if your workflow involves very lengthy operations that often freeze workers, it can be beneficial
    # to set min_num_requests to 1 (at a cost of potential inefficiency, i.e. policy worker will use very small
    # batches)
    min_num_requests = self.cfg.num_workers // (self.cfg.num_policies * self.cfg.policy_workers_per_policy)
    min_num_requests //= 3
    min_num_requests = max(1, min_num_requests)

    # Again, a very conservative timer. Only wait a little bit, then continue operation.
    wait_for_min_requests = 0.025

    while not self.terminate:
        try:
            while self.stop_experience_collection[self.policy_id]:
                with self.resume_experience_collection_cv:
                    self.resume_experience_collection_cv.wait(timeout=0.05)

            waiting_started = time.time()
            while len(self.requests) < min_num_requests and time.time() - waiting_started < wait_for_min_requests:
                try:
                    with timing.timeit('wait_policy'), timing.add_time('wait_policy_total'):
                        policy_requests = self.policy_queue.get_many(timeout=0.005)
                    self.requests.extend(policy_requests)
                except Empty:
                    pass

            self._update_weights(timing)

            with timing.timeit('one_step'), timing.add_time('handle_policy_step'):
                if self.initialized:
                    if len(self.requests) > 0:
                        request_count.append(len(self.requests))
                        self._handle_policy_steps(timing)

            try:
                task_type, data = self.task_queue.get_nowait()

                # task from the task_queue
                if task_type == TaskType.INIT:
                    self._init()
                elif task_type == TaskType.TERMINATE:
                    self.terminate = True
                    break
                elif task_type == TaskType.INIT_MODEL:
                    self._init_model(data)

                self.task_queue.task_done()
            except Empty:
                pass

            if time.time() - last_report > 3.0 and 'one_step' in timing:
                timing_stats = dict(wait_policy=timing.wait_policy, step_policy=timing.one_step)
                samples_since_last_report = self.total_num_samples - last_report_samples

                stats = memory_stats('policy_worker', self.device)
                if len(request_count) > 0:
                    stats['avg_request_count'] = np.mean(request_count)

                self.report_queue.put(dict(
                    timing=timing_stats,
                    samples=samples_since_last_report,
                    policy_id=self.policy_id,
                    stats=stats,
                ))
                last_report = time.time()
                last_report_samples = self.total_num_samples

            if time.time() - last_cache_cleanup > 300.0 or (not self.cfg.benchmark and self.total_num_samples < 1000):
                if self.cfg.device == 'gpu':
                    torch.cuda.empty_cache()
                last_cache_cleanup = time.time()

        except KeyboardInterrupt:
            log.warning('Keyboard interrupt detected on worker %d-%d', self.policy_id, self.worker_idx)
            self.terminate = True
        except Exception:
            log.exception('Unknown exception on policy worker')
            self.terminate = True

    time.sleep(0.2)
    log.info('Policy worker avg. requests %.2f, timing: %s', np.mean(request_count), timing)
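# Worked example for the min_num_requests arithmetic above, with a hypothetical
# config: num_workers=32, num_policies=1, policy_workers_per_policy=2 gives
# 32 // (1 * 2) = 16, then 16 // 3 = 5, so this policy worker batches at least
# 5 requests, or whatever arrived within the 25 ms window, whichever comes first.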
def extract_data(self, trajectories):
    timing = Timing()

    if len(self.buffer) > self.params.distance_target_buffer_size:
        # already enough data
        return

    close, far = self.params.close_threshold, self.params.far_threshold

    num_close, num_far = 0, 0
    data_added = 0

    with timing.timeit('trajectories'):
        for trajectory in trajectories:
            check_tmax = isinstance(trajectory, TmaxTrajectory)

            obs = trajectory.obs

            indices = list(range(len(trajectory)))
            np.random.shuffle(indices)

            for i in indices:
                if len(self.buffer) > self.params.distance_target_buffer_size // 2:
                    if data_added > self.params.distance_target_buffer_size // 4:  # to limit memory usage
                        break

                if len(self.buffer) > self.params.distance_target_buffer_size:
                    break

                close_i = min(i + close, len(trajectory))
                far_i = min(i + far, len(trajectory))

                # sample close observation pair
                first_idx = i
                second_idx = np.random.randint(i, close_i)

                # in TMAX we do some additional checks
                add_close = True
                if check_tmax:
                    both_frames_random = trajectory.is_random[first_idx] and trajectory.is_random[second_idx]
                    first_exploration = trajectory.mode[first_idx] == TmaxMode.EXPLORATION
                    second_exploration = trajectory.mode[second_idx] == TmaxMode.EXPLORATION
                    if both_frames_random or (first_exploration and second_exploration):
                        add_close = True
                    else:
                        add_close = False

                if add_close:
                    if self.params.distance_symmetric and random.random() < 0.5:
                        first_idx, second_idx = second_idx, first_idx

                    self.buffer.add(obs_first=obs[first_idx], obs_second=obs[second_idx], labels=0)
                    data_added += 1
                    num_close += 1

                # sample far observation pair
                if far_i < len(trajectory):
                    first_idx = i
                    second_idx = np.random.randint(far_i, len(trajectory))

                    add_far = True
                    if check_tmax:
                        both_frames_random = trajectory.is_random[first_idx] and trajectory.is_random[second_idx]
                        first_exploration = trajectory.mode[first_idx] == TmaxMode.EXPLORATION
                        second_exploration = trajectory.mode[second_idx] == TmaxMode.EXPLORATION
                        if both_frames_random or (first_exploration and second_exploration):
                            add_far = True
                        else:
                            add_far = False

                    if add_far:
                        if self.params.distance_symmetric and random.random() < 0.5:
                            first_idx, second_idx = second_idx, first_idx

                        self.buffer.add(obs_first=obs[first_idx], obs_second=obs[second_idx], labels=1)
                        data_added += 1
                        num_far += 1

    with timing.timeit('finalize'):
        self.buffer.trim_at(self.params.distance_target_buffer_size)

    if self.batch_num % 20 == 0:
        with timing.timeit('visualize'):
            self._visualize_data()

    self.batch_num += 1
    log.info('num close %d, num far %d, distance net timing %s', num_close, num_far, timing)
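# In miniature, the pair-sampling scheme above labels a pair of frames 0
# ("close") when the second frame is fewer than `close` steps ahead of the
# first, and 1 ("far") when it is at least `far` steps ahead; intermediate
# gaps are never sampled. A standalone sketch of that core idea (assumed
# thresholds, TMAX-specific checks omitted):
import numpy as np


def sample_labeled_pairs(trajectory_len, i, close, far):
    close_i = min(i + close, trajectory_len)
    pairs = [(i, np.random.randint(i, close_i), 0)]  # close pair, label 0
    far_i = min(i + far, trajectory_len)
    if far_i < trajectory_len:
        pairs.append((i, np.random.randint(far_i, trajectory_len), 1))  # far pair, label 1
    return pairs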
def extract_data(self, trajectories):
    timing = Timing()

    if len(trajectories) <= 0:
        return

    if len(self.buffer) > self.params.locomotion_experience_replay_buffer:
        return

    with timing.timeit('trajectories'):
        max_trajectory = self.params.locomotion_max_trajectory

        data_so_far = 0

        trajectories = [t for t in trajectories if len(t) > self.params.locomotion_max_trajectory]

        # train only on random frames
        random_frames = [[i for i, is_random in enumerate(t.is_random) if is_random] for t in trajectories]

        total_experience = sum(len(frames) for frames in random_frames)
        max_total_experience = 0.75 * total_experience  # max fraction of experience to use
        max_num_segments = int(max_total_experience / max_trajectory)
        log.info(
            '%d total experience from %d trajectories (%d segments)',
            max_total_experience, len(trajectories), max_num_segments,
        )

        attempts = 0

        while data_so_far < max_total_experience:
            attempts += 1
            if attempts > 100 * max_total_experience:  # just in case
                break

            trajectory_idx = random.choice(range(len(trajectories)))
            trajectory = trajectories[trajectory_idx]

            if len(random_frames[trajectory_idx]) <= max_trajectory:
                continue

            first_random_frame = random_frames[trajectory_idx][0]
            if len(trajectory) - first_random_frame < max_trajectory:
                continue

            # sample random interval in trajectory, treat the last frame as "imaginary" goal, use actions as
            # ground truth
            start_idx = random.randint(first_random_frame, len(trajectory) - 2)
            goal_idx = min(start_idx + max_trajectory, len(trajectory) - 1)
            assert start_idx < goal_idx

            if not trajectory.is_random[start_idx]:
                continue
            if not trajectory.is_random[goal_idx]:
                continue

            for i in range(start_idx, goal_idx):
                if not trajectory.is_random[i]:
                    continue

                assert 0 < goal_idx - i <= max_trajectory
                self.buffer.add(
                    obs_prev=trajectory.obs[max(0, i - 1)],
                    obs_curr=trajectory.obs[i],
                    obs_goal=trajectory.obs[goal_idx],
                    actions=trajectory.actions[i],
                    mode=trajectory.mode[i],
                    diff=goal_idx - i,
                )
                data_so_far += 1

            if len(self.buffer) > self.params.locomotion_experience_replay_buffer:
                break

    # if self.batch_num % 10 == 0:
    #     with timing.timeit('vis'):
    #         self._visualize_data(training_data)

    # with timing.timeit('finalize'):
    #     for traj_buffer in training_data:
    #         self.buffer.add_buff(traj_buffer)
    #     self.shuffle_data()
    #     self.buffer.trim_at(self.params.locomotion_experience_replay_buffer)

    self.batch_num += 1
    log.info('Locomotion, buffer size: %d, timing: %s', len(self.buffer), timing)
def train(self, buffer, env_steps, agent, timing=None):
    if timing is None:
        timing = Timing()

    params = agent.params

    batch_size = params.distance_batch_size
    summary = None
    dist_step = self.step.eval(session=agent.session)

    prev_loss = 1e10
    num_epochs = params.distance_train_epochs

    log.info('Train distance net %d pairs, batch %d, epochs %d', len(buffer), batch_size, num_epochs)

    with timing.timeit('dist_epochs'):
        for epoch in range(num_epochs):
            losses = []

            with timing.add_time('shuffle'):
                buffer.shuffle_data()
            obs_first, obs_second, labels = buffer.obs_first, buffer.obs_second, buffer.labels

            with timing.add_time('batch'):
                for i in range(0, len(obs_first) - 1, batch_size):
                    # noinspection PyProtectedMember
                    with_summaries = agent._should_write_summaries(dist_step) and summary is None
                    summaries = [self.summaries] if with_summaries else []

                    start, end = i, i + batch_size

                    result = agent.session.run(
                        [self.loss, self.train_op] + summaries,
                        feed_dict={
                            self.ph_obs_first: obs_first[start:end],
                            self.ph_obs_second: obs_second[start:end],
                            self.ph_labels: labels[start:end],
                            self.ph_is_training: True,
                        },
                    )

                    dist_step += 1
                    # noinspection PyProtectedMember
                    agent._maybe_save(dist_step, env_steps)
                    losses.append(result[0])

                    if with_summaries:
                        summary = result[-1]
                        agent.summary_writer.add_summary(summary, global_step=env_steps)

            # check loss improvement at the end of each epoch, early stop if necessary
            avg_loss = np.mean(losses)
            if avg_loss >= prev_loss:
                log.info('Early stopping after %d epochs because distance net did not improve', epoch + 1)
                log.info('Was %.4f now %.4f, ratio %.3f', prev_loss, avg_loss, avg_loss / prev_loss)
                break

            prev_loss = avg_loss

    return dist_step
def sample(self, proc_idx):
    # workers should ignore Ctrl+C because the termination is handled in the event loop by a special msg
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    timing = Timing()

    psutil.Process().nice(10)

    num_envs = len(DMLAB30_LEVELS_THAT_USE_LEVEL_CACHE)
    assert self.cfg.num_workers % num_envs == 0, \
        f'should have an integer number of workers per env, e.g. {1 * num_envs}, {2 * num_envs}, etc...'
    assert self.cfg.num_envs_per_worker == 1, 'use populate_cache with 1 env per worker'

    with timing.timeit('env_init'):
        env_key = 'env'
        env_desired_num_levels = 0

        global_env_id = proc_idx * self.cfg.num_envs_per_worker
        env_config = AttrDict(worker_index=proc_idx, vector_index=0, env_id=global_env_id)
        env = create_env(self.cfg.env, cfg=self.cfg, env_config=env_config)
        env.seed(global_env_id)

        # this is to track the performance for individual DMLab levels
        if hasattr(env.unwrapped, 'level_name'):
            env_key = env.unwrapped.level_name
            env_level = env.unwrapped.level

            approx_num_episodes_per_1b_frames = DMLAB30_APPROX_NUM_EPISODES_PER_BILLION_FRAMES[env_key]
            num_billions = DESIRED_TRAINING_LENGTH / int(1e9)
            num_workers_for_env = self.cfg.num_workers // num_envs
            env_desired_num_levels = int((approx_num_episodes_per_1b_frames * num_billions) / num_workers_for_env)

            env_num_levels_generated = len(
                dmlab_level_cache.DMLAB_GLOBAL_LEVEL_CACHE[0].all_seeds[env_level]
            ) // num_workers_for_env

            log.warning('Worker %d (env %s) generated %d/%d levels!',
                        proc_idx, env_key, env_num_levels_generated, env_desired_num_levels)
            time.sleep(4)

        env.reset()
        env_uses_level_cache = env.unwrapped.env_uses_level_cache

        self.report_queue.put(dict(proc_idx=proc_idx, finished_reset=True))

    self.start_event.wait()

    try:
        with timing.timeit('work'):
            last_report = last_report_frames = total_env_frames = 0
            while not self.terminate.value and total_env_frames < self.cfg.sample_env_frames_per_worker:
                action = env.action_space.sample()
                with timing.add_time(f'{env_key}.step'):
                    env.step(action)

                total_env_frames += 1

                with timing.add_time(f'{env_key}.reset'):
                    env.reset()

                env_num_levels_generated += 1
                log.debug('Env %s done %d/%d resets', env_key, env_num_levels_generated, env_desired_num_levels)

                if env_num_levels_generated >= env_desired_num_levels:
                    log.debug('%s finished %d/%d resets, sleeping...',
                              env_key, env_num_levels_generated, env_desired_num_levels)
                    time.sleep(30)  # free up CPU time for other envs

                # if env does not use level cache, there is no need to run it
                # let other workers proceed
                if not env_uses_level_cache:
                    log.debug('Env %s does not require cache, sleeping...', env_key)
                    time.sleep(200)

                with timing.add_time('report'):
                    now = time.time()
                    if now - last_report > self.report_every_sec:
                        last_report = now
                        frames_since_last_report = total_env_frames - last_report_frames
                        last_report_frames = total_env_frames
                        self.report_queue.put(dict(proc_idx=proc_idx, env_frames=frames_since_last_report))

                if get_free_disk_space_mb(self.cfg) < 3 * 1024:
                    log.error('Not enough disk space! %d', get_free_disk_space_mb(self.cfg))
                    time.sleep(200)
    except Exception:
        log.exception('Unknown exception')
        log.error('Unknown exception in worker %d, terminating...', proc_idx)
        self.report_queue.put(dict(proc_idx=proc_idx, crash=True))

    time.sleep(proc_idx * 0.1 + 0.1)
    log.info('Process %d finished sampling. Timing: %s', proc_idx, timing)

    env.close()