def train_distance(params, env_id):
    """Train the distance network for env_id; always closes envs and finalizes the agent."""
    def make_env_func():
        return create_env(env_id)

    agent = AgentTMAX(make_env_func, params)
    agent.initialize()

    multi_env = None
    try:
        multi_env = MultiEnv(
            params.num_envs,
            params.num_workers,
            make_env_func=agent.make_env_func,
            stats_episodes=params.stats_episodes,
        )
        train_loop(agent, multi_env)
    except (Exception, KeyboardInterrupt, SystemExit):
        # broad catch is deliberate: we still want the cleanup below on Ctrl+C
        log.exception('Interrupt...')
    finally:
        log.info('Closing env...')
        if multi_env is not None:
            multi_env.close()

    agent.finalize()
    return 0
def evaluate_experiment(env_id, experiment_name, num_envs=96):
    """Evaluate a saved locomotion experiment.

    Returns (success_rate, avg_speed); speed is -1 when no positive speeds were recorded.
    """
    # fixed seeds
    random.seed(0)
    np.random.seed(0)
    tf.random.set_random_seed(0)

    params = AgentTMAX.Params(experiment_name)
    params = params.load()
    params.seed = 0

    # for faster evaluation
    params.num_envs = num_envs
    params.num_workers = 32 if num_envs >= 32 else num_envs

    def make_env_func():
        env = create_env(env_id, skip_frames=True)
        env.seed(0)
        return env

    agent = AgentTMAX(make_env_func, params)
    agent.initialize()

    rate, speed = 0, -1
    multi_env = None
    try:
        multi_env = MultiEnv(
            params.num_envs,
            params.num_workers,
            make_env_func=make_env_func,
            stats_episodes=params.stats_episodes,
        )

        success, avg_speed = evaluate_locomotion_agent(agent, multi_env)
        log.info('Finished evaluating experiment %s', experiment_name)

        rate = np.mean(success)
        # speeds <= 0 mean the episode never progressed; exclude them from the average
        positive_speeds = [s for s in avg_speed if s > 0]
        speed = np.mean(positive_speeds) if positive_speeds else -1

        log.info('Success rate %.1f%%, avg. speed %.2f edges/frame', rate * 100, speed)
    except (Exception, KeyboardInterrupt, SystemExit):
        log.exception('Interrupt...')
    finally:
        log.info('Closing env...')
        if multi_env is not None:
            multi_env.close()

    agent.finalize()
    return rate, speed
def test_dist_training(self):
    """Smoke-test distance-network training on synthetic two-color observations."""
    timing = Timing()

    def make_env():
        return make_doom_env(doom_env_by_name(TEST_ENV_NAME))

    params = AgentTMAX.Params('__test_dist_train__')
    params.distance_target_buffer_size = 1000

    with timing.timeit('generate_data'):
        # first: generate fake random data
        buffer = Buffer()
        green_obs = np.full([84, 84, 3], 0, dtype=np.uint8)
        green_obs[:, :, 1] = 255
        blue_obs = np.full([84, 84, 3], 0, dtype=np.uint8)
        blue_obs[:, :, 2] = 255

        data_size = params.distance_target_buffer_size
        for i in range(data_size):
            same = i % 2 == 0
            if same:
                # identical pair, color chosen at random (one RNG draw, as before)
                obs_first = obs_second = green_obs if random.random() < 0.5 else blue_obs
            else:
                # different pair, order randomized (one RNG draw, as before)
                obs_first, obs_second = green_obs, blue_obs
                if random.random() < 0.5:
                    obs_first, obs_second = obs_second, obs_first
            buffer.add(obs_first=obs_first, obs_second=obs_second, labels=0 if same else 1)

    with timing.timeit('init'):
        agent = AgentTMAX(make_env, params)
        agent.initialize()
        params.distance_train_epochs = 1
        params.distance_batch_size = 256
        agent.distance.train(buffer, 1, agent)

    with timing.timeit('train'):
        params.distance_train_epochs = 2
        params.distance_batch_size = 64
        agent.distance.train(buffer, 1, agent, timing)

    agent.finalize()
    log.info('Timing: %s', timing)
    shutil.rmtree(params.experiment_dir())
def train(params, env_id):
    """Run the full TMAX training loop for env_id and return the agent's final status."""
    env_factory = partial(create_env, env=env_id)
    agent = AgentTMAX(env_factory, params=params)
    agent.initialize()
    result = agent.learn()
    agent.finalize()
    return result
def trajectory_to_map(params, env_id):
    """Build sparse and dense topological maps from recorded trajectories.

    Trajectories come either from pickled files in the experiment's
    '.trajectories' dir, or are reconstructed from a persistent map checkpoint.
    Saves a dense-map checkpoint and cross-checks sparse/dense landmark
    correspondence. Returns 0.
    """
    def make_env_func():
        e = create_env(env_id)
        e.seed(0)  # deterministic env for map generation
        return e

    params.num_envs = 1
    params.with_timer = False
    agent = AgentTMAX(make_env_func, params)
    agent.initialize()

    # top-down environment map image used for visualization of checkpoints
    map_img, coord_limits = generate_env_map(make_env_func)

    experiment_dir = params.experiment_dir()
    trajectories_dir = ensure_dir_exists(join(experiment_dir, '.trajectories'))

    if params.persistent_map_checkpoint is None:
        # load all pickled trajectories ('traj_*' dirs), sorted for stable ordering
        prefix = 'traj_'
        all_trajectories = glob.glob(f'{trajectories_dir}/{prefix}*')
        all_trajectories.sort()
        trajectories = []
        for i, trajectory_dir in enumerate(all_trajectories):
            with open(join(trajectory_dir, 'trajectory.pickle'), 'rb') as traj_file:
                traj = Trajectory(i)
                # restore trajectory state directly from the pickled attribute dict
                traj.__dict__.update(pickle.load(traj_file))
                trajectories.append(traj)
    else:
        # reconstruct per-trajectory frame sequences from a persistent map checkpoint
        loaded_persistent_map = TopologicalMap.create_empty()
        loaded_persistent_map.maybe_load_checkpoint(params.persistent_map_checkpoint)
        num_trajectories = loaded_persistent_map.num_trajectories
        trajectories = [Trajectory(i) for i in range(num_trajectories)]
        zero_frame = loaded_persistent_map.graph.nodes[0]
        # seed trajectories 1..N-1 with node 0's frame; trajectory 0 presumably
        # gets node 0 from the node loop below — TODO confirm
        for i in range(1, num_trajectories):
            trajectories[i].add(zero_frame['obs'], -1, zero_frame['info'])
        # NOTE(review): relies on graph node iteration order matching frame order — confirm
        for node in loaded_persistent_map.graph.nodes(data=True):
            node_idx, d = node  # node_idx unused; kept for clarity of the (idx, data) pair
            trajectories[d['traj_idx']].add(d['obs'], -1, d['info'])
        log.info('Loaded %d trajectories from the map', num_trajectories)
        log.info('Trajectory lengths %r', [len(t) for t in trajectories])

    def init_map():
        # fresh undirected map rooted at the first frame of trajectory 0
        return TopologicalMap(
            trajectories[0].obs[0],
            directed_graph=False,
            initial_info=trajectories[0].infos[0],
        )

    map_builder = MapBuilder(agent)
    # trajectories = [map_builder.sparsify_trajectory(t) for t in trajectories]  # TODO

    sparse_map = trajectories_to_sparse_map(
        init_map, trajectories, trajectories_dir, agent, map_img, coord_limits,
    )

    test_pick_best_trajectory = True
    if test_pick_best_trajectory:
        # deepcopy: pick_best_trajectory may mutate trajectories — keep originals intact
        pick_best_trajectory(init_map, agent, copy.deepcopy(trajectories))

    # build the dense map by adding every trajectory in order
    m = init_map()
    for i, t in enumerate(trajectories):
        m = map_builder.add_trajectory_to_dense_map(m, t)

    map_builder.calc_distances_to_landmarks(sparse_map, m)
    map_builder.sieve_landmarks_by_distance(sparse_map)

    dense_map_dir = ensure_dir_exists(join(trajectories_dir, 'dense_map'))
    m.save_checkpoint(dense_map_dir, map_img=map_img, coord_limits=coord_limits, verbose=True)

    # check if landmark correspondence between dense and sparse map is correct
    for node, data in sparse_map.graph.nodes.data():
        traj_idx = data['traj_idx']
        frame_idx = data['frame_idx']
        dense_map_landmark = m.frame_to_node_idx[traj_idx][frame_idx]
        log.info('Sparse map node %d corresponds to dense map node %d', node, dense_map_landmark)
        log.info('Sparse map node %d distance %d', node, data['distance'])
        obs_sparse = sparse_map.get_observation(node)
        obs_dense = m.get_observation(dense_map_landmark)
        # the same (trajectory, frame) must yield the same observation in both maps
        assert np.array_equal(obs_sparse, obs_dense)

        show_landmarks = False  # flip manually for visual debugging
        if show_landmarks:
            import cv2
            cv2.imshow('sparse', obs_sparse)
            cv2.imshow('dense', obs_dense)
            cv2.waitKey()

    agent.finalize()
    return 0
def enjoy(params, env_id, max_num_episodes=1000, max_num_frames=None, show_automap=False):
    """Interactively run a trained TMAX agent, rendering and throttling to ~15 FPS.

    Relies on module-level state not visible in this chunk: `persistent_map`,
    `current_landmark` (declared global here) and `terminate`, `pause`,
    `current_actions`, `policy_type` — presumably driven by a keyboard
    handler; TODO confirm. Returns 0.
    """
    def make_env_func():
        e = create_env(env_id, mode='test', show_automap=show_automap)
        e.seed(0)
        return e

    params = params.load()
    params.num_envs = 1  # during execution we're only using one env
    agent = AgentTMAX(make_env_func, params)
    env = make_env_func()

    agent.initialize()

    global persistent_map
    if agent.params.persistent_map_checkpoint is not None:
        persistent_map = TopologicalMap.create_empty()
        persistent_map.maybe_load_checkpoint(agent.params.persistent_map_checkpoint)

    global current_landmark

    episode_rewards = []
    num_frames = 0  # total frames across all episodes

    def max_frames_reached(frames):
        return max_num_frames is not None and frames > max_num_frames

    for _ in range(max_num_episodes):
        env_obs, info = reset_with_info(env)
        done = False
        obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)
        prev_obs = obs
        if current_landmark is None:
            # first frame of the first episode becomes the locomotion landmark
            current_landmark = obs

        if goal_obs is not None:
            # show the goal observation in its own window (env gives BGR here — verify)
            goal_obs_rgb = cv2.cvtColor(goal_obs, cv2.COLOR_BGR2RGB)
            cv2.imshow('goal', cv2.resize(goal_obs_rgb, (500, 500)))
            cv2.waitKey(500)

        episode_reward, episode_frames = 0, 0

        if not agent.tmax_mgr.initialized:
            agent.tmax_mgr.initialize([obs], [info], env_steps=0)
            persistent_map = agent.tmax_mgr.dense_persistent_maps[-1]
            sparse_persistent_map = agent.tmax_mgr.sparse_persistent_maps[-1]
            log.debug('Num landmarks in sparse map: %d', sparse_persistent_map.num_landmarks())

        # force exploration mode with no locomotion targets for this episode
        agent.curiosity.initialized = True
        agent.tmax_mgr.mode[0] = TmaxMode.EXPLORATION
        agent.tmax_mgr.locomotion_final_targets[0] = None
        agent.tmax_mgr.locomotion_targets[0] = None

        start_episode = time.time()
        t = Timing()
        while not done and not terminate and not max_frames_reached(num_frames):
            with t.timeit('one_frame'):
                env.render()
                cv2.waitKey(1)  # to prevent window from fading

                if pause:
                    time.sleep(0.01)
                    continue

                if len(current_actions) > 0:
                    # key combinations are not handled, but this is purely for testing
                    action = current_actions[-1]
                else:
                    action = 0

                # select the action source according to the current policy type
                if policy_type == PolicyType.PLAYER:
                    pass  # keep the keyboard-selected action
                elif policy_type == PolicyType.RANDOM:
                    action = env.action_space.sample()
                elif policy_type == PolicyType.AGENT:
                    agent.tmax_mgr.mode[0] = TmaxMode.EXPLORATION
                    action, *_ = agent.policy_step([prev_obs], [obs], [goal_obs], None, None)
                    action = action[0]
                elif policy_type == PolicyType.LOCOMOTION:
                    agent.tmax_mgr.mode[0] = TmaxMode.LOCOMOTION
                    action, _, _ = agent.loco_actor_critic.invoke(
                        agent.session, [obs], [current_landmark], None, None, [1.0],
                    )
                    action = action[0]

                env_obs, rew, done, info = env.step(action)
                next_obs, goal_obs = main_observation(env_obs), goal_observation(env_obs)

                # keep the topological-map manager in sync with the transition
                _, _ = agent.tmax_mgr.update(
                    [obs], [next_obs], [rew], [done], [info], num_frames, t, verbose=True,
                )

                prev_obs = obs
                obs = next_obs

                # diagnostics printed/displayed per frame
                calc_distance_to_memory(agent, sparse_persistent_map, obs)
                calc_value_estimate(agent, obs)

                episode_reward += rew

                num_frames += 1
                episode_frames += 1

            # throttle the loop to the desired frame rate
            took_seconds = t.one_frame
            desired_fps = 15  # (4-repeated here, which means actually 60fps)
            wait_seconds = (1.0 / desired_fps) - took_seconds
            wait_seconds = max(0.0, wait_seconds)
            if wait_seconds > EPS:
                time.sleep(wait_seconds)

        env.render()
        log.info('Actual fps: %.1f', episode_frames / (time.time() - start_episode))
        time.sleep(0.2)

        episode_rewards.append(episode_reward)
        last_episodes = episode_rewards[-100:]  # running average over last 100 episodes
        avg_reward = sum(last_episodes) / len(last_episodes)
        log.info(
            'Episode reward: %f, avg reward for %d episodes: %f',
            episode_reward, len(last_episodes), avg_reward,
        )

        if max_frames_reached(num_frames) or terminate:
            break

    agent.finalize()
    env.close()
    cv2.destroyAllWindows()
    return 0
def test_locomotion(params, env_id):
    """Interactively test the locomotion policy navigating toward a fixed landmark.

    Uses module-level `terminate` and `pause` flags not visible in this chunk —
    presumably set by a keyboard handler; TODO confirm. Returns 0.
    """
    def make_env_func():
        e = create_env(env_id, skip_frames=True)
        e.seed(0)  # deterministic env for reproducible runs
        return e

    # params = params.load()
    # params.ensure_serialized()
    params.num_envs = 1
    # params.naive_locomotion = True
    agent = AgentTMAX(make_env_func, params)
    agent.initialize()

    env = make_env_func()
    env_obs, info = reset_with_info(env)
    obs_prev = obs = main_observation(env_obs)
    done = False

    # get a topological map: either from a checkpoint or freshly from the tmax manager
    if params.persistent_map_checkpoint is not None:
        loaded_persistent_map = TopologicalMap.create_empty()
        loaded_persistent_map.maybe_load_checkpoint(params.persistent_map_checkpoint)
    else:
        agent.tmax_mgr.initialize([obs], [info], 1)
        loaded_persistent_map = agent.tmax_mgr.dense_persistent_maps[-1]

    m = loaded_persistent_map
    t = Timing()

    log.info('Num landmarks: %d', m.num_landmarks())
    final_goal_idx = 49  # hard-coded landmark index used as the navigation goal
    log.info('Locomotion goal is %d', final_goal_idx)

    # localizer = Localizer(m, agent)
    final_goal_obs = m.get_observation(final_goal_idx)

    # windows for the current intermediate target and the final goal
    cv2.namedWindow('next_target')
    cv2.moveWindow('next_target', 800, 100)
    cv2.namedWindow('final_goal')
    cv2.moveWindow('final_goal', 1400, 100)

    display_obs('next_target', obs)
    display_obs('final_goal', final_goal_obs)
    cv2.waitKey(1)

    # localizer.current_landmark = 0
    # next_target = localizer.get_next_target(obs, final_goal_idx)
    # next_target_obs = m.get_observation(next_target)

    frame = 0
    if params.naive_locomotion:
        navigator = NavigatorNaive(agent)
    else:
        navigator = Navigator(agent)
    navigator.reset(0, m)

    # navigator API is batched; we run a single env, hence the [0] indexing
    next_target, next_target_d = navigator.get_next_target(
        [m], [obs], [final_goal_idx], [frame],
    )
    next_target, next_target_d = next_target[0], next_target_d[0]
    next_target_obs = m.get_observation(next_target)

    while not done and not terminate:
        with t.timeit('one_frame'):
            env.render()

            if not pause:
                # randomly alternate deterministic/stochastic action selection
                if random.random() < 0.5:
                    deterministic = False
                else:
                    deterministic = True

                if params.naive_locomotion:
                    action = navigator.replay_action([0])[0]
                else:
                    action = agent.locomotion.navigate(
                        agent.session, [obs_prev], [obs], [next_target_obs],
                        deterministic=deterministic,
                    )[0]

                env_obs, rew, done, info = env.step(action)
                log.info('Action is %d', action)

                obs_prev = obs
                obs = main_observation(env_obs)

                # re-localize and pick the next intermediate target
                next_target, next_target_d = navigator.get_next_target(
                    [m], [obs], [final_goal_idx], [frame],
                )
                next_target, next_target_d = next_target[0], next_target_d[0]

                if next_target is None:
                    log.error('We are lost!')
                else:
                    log.info('Next target is %d with distance %.3f!', next_target, next_target_d)

                # show the previous target obs; refresh it only if we're not lost
                display_obs('next_target', next_target_obs)
                cv2.waitKey(1)
                if next_target is not None:
                    next_target_obs = m.get_observation(next_target)

                log.info('Frame %d...', frame)

        # throttle the loop to the desired frame rate
        took_seconds = t.one_frame
        desired_fps = 10
        wait_seconds = (1.0 / desired_fps) - took_seconds
        wait_seconds = max(0.0, wait_seconds)
        if wait_seconds > EPS:
            time.sleep(wait_seconds)

        if not pause:
            frame += 1

    log.info('After loop')
    env.render()
    time.sleep(0.05)

    env.close()
    agent.finalize()
    return 0