def __init__(self, env_name, discount, TH, memory_size=None):
    """A base class for tabular RL agents.

    Parameters
    ----------
    env_name : str
        Name of an experimental domain defined in models.py.
    discount : float
        Discount factor of the MDP.
    TH : int
        Finite time horizon (maximum number of learning steps).
    memory_size : int, optional
        Size of the experience replay memory.
    """
    self.env = envs.make(env_name, type='classic_mdp')
    self.discount = discount
    self.states = []
    self.actions = []
    self.rewards = []
    self.np_random, _ = seeding.np_random(None)
    self.test_counts = []
    self.test_rewards = []
    self.Q_err = []
    self.Q_target = np.array(self.env.optQ(self.discount)).astype(np.float16)
    self.visits = np.zeros((self.env.snum, self.env.anum))
    self.memory_size = memory_size
    self.replayMem = {(i, j): [] for i in range(self.env.snum)
                      for j in range(self.env.anum)}
    if TH is not None:
        self.env.set_time(TH)
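# A minimal sketch (not part of the original class) of how the per-(state,
# action) replay memory above could be appended to while respecting
# `memory_size`. The method name `store_transition` is hypothetical.
def store_transition(self, state, action, reward, next_state, done):
    """Append a transition to the (state, action) bucket, evicting the
    oldest entry once the bucket reaches `memory_size` (FIFO)."""
    bucket = self.replayMem[(state, action)]
    if self.memory_size is not None and len(bucket) >= self.memory_size:
        bucket.pop(0)  # drop the oldest transition first
    bucket.append((state, action, reward, next_state, done))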
def test_model():
    env_name = 'DartHopperPT-v1'
    env = make_parallel(1, env_name, num=2)
    env2 = make(env_name, num=2, stochastic=False)
    batch_size = 30
    horizon = 100

    # Collect initial states and model parameters for a batch of rollouts.
    s = []
    for i in range(batch_size):
        env2.reset()
        s.append(get_state(env2))
    param = get_params(env2)
    params = np.array([param for _ in range(batch_size)])
    env2.env.noisy_input = False
    s = np.array(s)

    # Random action sequences, one per rollout.
    a = [[env2.action_space.sample() for _ in range(horizon)]
         for _ in range(batch_size)]
    a = np.array(a)

    # Step the single env a few times with the last action sequence.
    for i in range(3):
        obs, _, done, _ = env2.step(a[-1][i])
        if done:
            break

    # Run the batched rollouts through the parallel model.
    for _ in tqdm.trange(1):
        r, obs, mask = env(params, s, a)
    print(obs[-1][:3])
def __init__(self, make, env_name, num, stochastic_obs, done=True):
    self.env = make(env_name, num)
    self.env.reset()
    # TODO: disable observation noise.
    self.env.env.noisy_input = stochastic_obs
    self.done = done
def test():
    env = envs.make(args.env,
                    'atari',
                    render=bool(args.render),
                    record=bool(args.record),
                    directory=args.log_dir)
    learning_prop = json.load(
        open(os.path.join(args.log_dir, '../learning_prop.json'), 'r'))
    act_params = {
        'scope': "seed_%d" % learning_prop['seed'] + "/" + learning_prop['scope'],
        'eps': args.test_eps
    }
    act = deepq.load(os.path.join(args.log_dir, args.log_fname), act_params)
    episode_rew = 0
    t = 0
    while True:
        obs, done = env.reset(), False
        while not done:
            if args.render:
                env.render()
                time.sleep(0.05)
            obs, rew, done, info = env.step(act(obs[None])[0])
            # Reset only the environment but not the recorder.
            if args.record and done:
                obs, done = env.env.reset(), False
            episode_rew += rew
            t += 1
            if info['ale.lives'] == 0:
                print("Episode reward %.2f after %d steps" % (episode_rew, t))
                episode_rew = 0
                t = 0
def test():
    import json
    learning_prop = json.load(
        open(os.path.join(args.log_dir, 'learning_prop.json'), 'r'))
    env = envs.make(args.env,
                    render=bool(args.render),
                    record=bool(args.record),
                    ros=bool(args.ros),
                    map_name=learning_prop['map'],
                    num_targets=learning_prop['nb_targets'],
                    is_training=False,
                    )
    act_params = {'scope': learning_prop['scope'], 'eps': args.test_eps}
    act = simple.load(os.path.join(args.log_dir, args.log_fname), act_params)
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if args.render:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
def test(seed):
    learning_prop = json.load(
        open(os.path.join(args.log_dir, 'learning_prop.json'), 'r'))
    env = envs.make(args.env,
                    'ma_target_tracking',
                    render=bool(args.render),
                    record=bool(args.record),
                    directory=args.log_dir,
                    ros=bool(args.ros),
                    map_name=args.map,
                    num_agents=args.nb_agents,  # learning_prop['nb_agents'],
                    num_targets=learning_prop['nb_targets'],
                    is_training=False,
                    )
    act_params = {
        'scope': "seed_%d" % learning_prop['seed'] + "/" + learning_prop['scope'],
        'eps': args.test_eps
    }
    act = madeepq.load(os.path.join(args.log_dir, args.log_fname), act_params)
    from baselines0.evaluation import Test
    Eval = Test()
    Eval.test(args, env, act)
def run():
    env = envs.make(args.env_name)
    # flag_is_train = 1: one agent trains while the other only acts;
    # flag_is_train = 0: both agents only act (whose information is reported
    # depends on which agent is marked as the training agent).
    flag_is_train = args.flag_is_train
    # flag_focus_blue = 1 trains agent_blue; flag_focus_blue = 0 trains agent_red.
    flag_focus_blue = args.flag_focus_blue
    if flag_focus_blue:
        train_agent_name = 'blue'
        red_agent = DQN(env.state_dim, env.action_dim,
                        is_train=False, scope='red')
        blue_agent = DQN(env.state_dim, env.action_dim,
                         is_train=flag_is_train, scope='blue')
        alloc.check_scheme(blue_agent.is_train, red_agent.is_train,
                           train_agent_name)
        run_AirCombat_selfPlay(env, blue_agent, red_agent, train_agent_name)
    else:
        train_agent_name = 'red'
        blue_agent = DQN(env.state_dim, env.action_dim,
                         is_train=False, scope='blue')
        red_agent = DQN(env.state_dim, env.action_dim,
                        is_train=flag_is_train, scope='red')
        alloc.check_scheme(blue_agent.is_train, red_agent.is_train,
                           train_agent_name)
        run_AirCombat_selfPlay(env, red_agent, blue_agent, train_agent_name)
def run():
    env = envs.make(args.env_name)
    train_agent = DQN2013(env.state_dim,
                          env.action_dim,
                          is_train=True,
                          is_based=False,
                          scope="guidence")
    run_GuidenceEnv(env, train_agent)
def test_env():
    env_name = 'DartHopperPT-v1'
    env = make(env_name, num=2)
    # env = gym.make('Walker2d-v2')
    # env.reset()
    for i in tqdm.trange(10000):
        env.step(env.action_space.sample())
def main():
    env = envs.make("airCobate")
    # flag_is_train = 1: one agent trains while the other only acts;
    # flag_is_train = 0: both agents only act (whose information is reported
    # depends on the training-agent setting, flag_train_blue).
    flag_is_train = args.flag_is_train
    # flag_train_blue = 1 trains agent_blue; flag_train_blue = 0 trains agent_red.
    flag_train_blue = args.flag_train_blue
    # TODO: create multiple agents and pass them into the interactor's NANU.
    raise NotImplementedError
def train(seed, save_dir):
    logger.configure()
    set_global_seeds(seed)
    save_dir_0 = os.path.join(save_dir, 'seed_%d' % seed)
    os.makedirs(save_dir_0)
    env = envs.make(args.env,
                    'atari',
                    record=bool(args.record),
                    directory=save_dir_0)
    nb_test_steps = args.nb_test_steps if args.nb_test_steps > 0 else None
    with tf.device(args.device):
        with tf.compat.v1.variable_scope('seed_%d' % seed):
            model = deepq.models.cnn_to_mlp(
                convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
                hiddens=[256],
                dueling=bool(args.dueling),
            )
            act = deepq.learn(
                env,
                q_func=model,
                lr=args.learning_rate,
                lr_decay_factor=args.learning_rate_decay_factor,
                lr_growth_factor=args.learning_rate_growth_factor,
                max_timesteps=args.nb_train_steps,
                buffer_size=args.buffer_size,
                exploration_fraction=args.eps_fraction,
                exploration_final_eps=args.eps_min,
                train_freq=4,
                print_freq=1000,
                checkpoint_freq=int(args.nb_train_steps / 10),
                learning_starts=args.nb_warmup_steps,
                target_network_update_freq=args.target_update_freq,
                gamma=0.99,
                prioritized_replay=bool(args.prioritized),
                prioritized_replay_alpha=args.prioritized_replay_alpha,
                scope=args.scope,
                double_q=args.double_q,
                epoch_steps=args.nb_epoch_steps,
                eval_logger=Logger(args.env,
                                   'atari',
                                   nb_test_steps=nb_test_steps,
                                   save_dir=save_dir_0,
                                   render=bool(args.render)),
                save_dir=save_dir_0,
                test_eps=args.test_eps,
                gpu_memory=args.gpu_memory,
                render=bool(args.render),
            )
            print("Saving model to model.pkl")
            act.save(os.path.join(save_dir_0, "model.pkl"))
    env.close()
    if args.record == 1:
        env.moviewriter.finish()
def train():
    set_global_seeds(args.seed)
    directory = os.path.join(
        args.log_dir,
        '_'.join([args.env, datetime.datetime.now().strftime("%m%d%H%M")]))
    if not os.path.exists(directory):
        os.makedirs(directory)
    else:
        raise ValueError("The directory already exists...", directory)
    json.dump(vars(args),
              open(os.path.join(directory, 'learning_prop.json'), 'w'))
    env = envs.make(args.env,
                    render=bool(args.render),
                    record=bool(args.record),
                    dirname=directory)
    with tf.device(args.device):
        model = deepq.models.mlp([args.num_units] * args.num_layers)
        act, records = deepq.learn(
            env,
            q_func=model,
            lr=args.learning_rate,
            lr_decay_factor=args.learning_rate_decay_factor,
            lr_growth_factor=args.learning_rate_growth_factor,
            max_timesteps=args.nb_train_steps,
            buffer_size=args.buffer_size,
            batch_size=args.batch_size,
            exploration_fraction=args.eps_fraction,
            exploration_final_eps=args.eps_min,
            target_network_update_freq=args.target_update_freq,
            print_freq=10,
            checkpoint_freq=int(args.nb_train_steps / 10),
            learning_starts=args.nb_warmup_steps,
            gamma=args.gamma,
            prioritized_replay=bool(args.prioritized),
            prioritized_replay_alpha=args.prioritized_replay_alpha,
            callback=None,  #callback,
            epoch_steps=args.nb_epoch_steps,
            gpu_memory=args.gpu_memory,
            save_dir=directory,
            double_q=args.double_q,
            nb_test_steps=args.nb_test_steps,
            test_eps=args.test_eps,
            render=bool(args.render),
        )
        print("Saving model to model.pkl")
        act.save(os.path.join(directory, "model.pkl"))
        plot(records, directory)
    memo = input("Memo for this experiment?: ")
    f = open(os.path.join(directory, "memo.txt"), 'w')
    f.write(memo)
    f.close()
    if args.record == 1:
        env.moviewriter.finish()
def evaluation_maTTenv(act,
                       env_id,
                       eval_type='random',
                       nb_itrs=5,
                       render=False,
                       **kwargs):
    """Evaluate a policy on the multi-agent target tracking (maTTenv)
    environments over a set of different sampling zones. The set of sampling
    zones is defined in MATTENV_EVAL_SET.
    """
    if eval_type == 'random':
        params_set = [{}]
    elif eval_type == 'random_zone':
        params_set = MATTENV_EVAL_SET
    elif eval_type == 'fixed':
        params_set = [{'init_pose_list': kwargs['init_pose_list']}]
    elif eval_type == 'fixed_nb':
        if env_id == 'maTracking-v4':
            params_set = MA_EVAL
    else:
        raise ValueError("Wrong evaluation type for ttenv.")

    env = envs.make(env_id,
                    'ma_target_tracking',
                    render=render,
                    is_training=False,
                    **kwargs)
    total_rewards, total_nlogdetcov = [], []
    action_dict = {}
    for params in params_set:
        total_rewards_k, total_nlogdetcov_k = [], []
        for _ in range(nb_itrs):
            obs = env.reset(**params)
            done = {}
            episode_reward, episode_nlogdetcov, t = 0, 0, 0
            # In the multi-agent env, `done` stays a dict until the episode ends.
            while isinstance(done, dict):
                if render:
                    env.render()
                for agent_id, a_obs in obs.items():
                    action_dict[agent_id] = act(np.array(a_obs)[None])[0]
                obs, rew, done, info = env.step(action_dict)
                episode_reward += rew['__all__']
                episode_nlogdetcov += info['mean_nlogdetcov']
                t += 1
            total_rewards_k.append(episode_reward)
            total_nlogdetcov_k.append(episode_nlogdetcov)
        total_rewards.append(total_rewards_k)
        total_nlogdetcov.append(total_nlogdetcov_k)
    if render:
        env.close()
    if len(total_rewards) == 1:
        total_rewards = total_rewards[0]
        total_nlogdetcov = total_nlogdetcov[0]
    return (np.array(total_rewards, dtype=np.float32),
            np.array(total_nlogdetcov, dtype=np.float32))
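# Usage sketch for evaluation_maTTenv. `act` is assumed to be a callable that
# maps a batched observation to an action index (a trained policy loaded
# elsewhere in this file, or any stub with the same signature); the map name
# and agent/target counts are illustrative.
if __name__ == '__main__':
    act = lambda obs: [0]  # stub policy: always pick action 0
    rewards, nlogdetcov = evaluation_maTTenv(act,
                                             'maTracking-v4',
                                             eval_type='random',
                                             nb_itrs=2,
                                             render=False,
                                             map_name='empty',
                                             num_agents=2,
                                             num_targets=2)
    print("mean reward: %.2f" % rewards.mean())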
def test(env_id,
         isAtari,
         act_greedy,
         nb_itrs=3,
         nb_test_steps=10000,
         render=False):
    total_rewards = []
    for _ in range(nb_itrs):
        if isAtari:
            from baselines0.common.atari_wrappers import make_atari
            env_new = make_atari(env_id)
            env_new = deepq.wrap_atari_dqn(env_new)
        else:
            env_new = envs.make(env_id, render, figID=1)
        obs = env_new.reset()
        if nb_test_steps is None:
            # Evaluate until an episode (all lives, for Atari) ends.
            done_test = False
            episode_reward = 0
            t = 0
            while not done_test:
                action = act_greedy(np.array(obs)[None])[0]
                obs, rew, done, info = env_new.step(action)
                if render:
                    env_new.render(mode='test')
                episode_reward += rew
                t += 1
                if done:
                    obs = env_new.reset()
                    if (isAtari and (info['ale.lives'] == 0)) or (not isAtari):
                        done_test = done
            if render:
                env_new.close()
            total_rewards.append(episode_reward)
        else:
            # Evaluate for a fixed number of steps and average over episodes.
            t = 0
            episodes = []
            episode_reward = 0
            while t < nb_test_steps:
                action = act_greedy(np.array(obs)[None])[0]
                obs, rew, done, info = env_new.step(action)
                episode_reward += rew
                t += 1
                if done:
                    obs = env_new.reset()
                    if (isAtari and (info['ale.lives'] == 0)) or (not isAtari):
                        episodes.append(episode_reward)
                        episode_reward = 0
            if not episodes:
                episodes.append(episode_reward)
            total_rewards.append(np.mean(episodes))
    return np.array(total_rewards, dtype=np.float32)
def env_creator(env_config):
    env = envs.make('TargetTracking-v0',
                    # render=bool(args.render),
                    # record=bool(args.record),
                    # ros=bool(args.ros),
                    # dirname=directory,
                    map_name=env_config["map_name"],
                    # num_targets=env_config["num_targets"],
                    # im_size=args.im_size,
                    )
    return env  # return an env instance
def evaluation_ttenv(act,
                     env_id,
                     eval_type='random',
                     nb_itrs=5,
                     render=False,
                     **kwargs):
    """Evaluate a policy on the ttenv environments over a set of different
    sampling zones. The set of sampling zones is defined in TTENV_EVAL_SET.
    """
    from ttenv.metadata import TTENV_EVAL_SET, TTENV_EVAL_MULTI_SET
    # `num_targets` is not a parameter of this function; it is assumed to be
    # provided through kwargs (defaulting to a single target).
    num_targets = kwargs.get('num_targets', 1)
    if eval_type == 'random':
        params_set = [{}]
    elif eval_type == 'random_zone':
        params_set = TTENV_EVAL_SET if num_targets == 1 else TTENV_EVAL_MULTI_SET
    elif eval_type == 'fixed':
        params_set = [{'init_pose_list': kwargs['init_pose_list']}]
    else:
        raise ValueError("Wrong evaluation type for ttenv.")
    env = envs.make(env_id,
                    'target_tracking',
                    render=render,
                    is_training=False,
                    **kwargs)
    total_rewards, total_nlogdetcov = [], []
    for params in params_set:
        total_rewards_k, total_nlogdetcov_k = [], []
        for _ in range(nb_itrs):
            obs = env.reset(**params)
            done = False
            episode_reward, episode_nlogdetcov, t = 0, 0, 0
            while not done:
                if render:
                    env.render()
                action = act(np.array(obs)[None])[0]
                obs, rew, done, info = env.step(action)
                episode_reward += rew
                episode_nlogdetcov += info['mean_nlogdetcov']
                t += 1
            total_rewards_k.append(episode_reward)
            total_nlogdetcov_k.append(episode_nlogdetcov)
        total_rewards.append(total_rewards_k)
        total_nlogdetcov.append(total_nlogdetcov_k)
    if render:
        env.close()
    if len(total_rewards) == 1:
        total_rewards = total_rewards[0]
        total_nlogdetcov = total_nlogdetcov[0]
    return (np.array(total_rewards, dtype=np.float32),
            np.array(total_nlogdetcov, dtype=np.float32))
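# Usage sketch for evaluation_ttenv with the 'fixed' evaluation type, assuming
# `act` is a loaded policy (see the test scripts in this file) and that a
# pickled pose list such as the 'test_init_pose.pkl' written by the test
# script below exists; the path and map name here are illustrative.
import pickle
init_pose_list = pickle.load(open('results/test_init_pose.pkl', 'rb'))
rewards, nlogdetcov = evaluation_ttenv(act,
                                       'TargetTracking-v0',
                                       eval_type='fixed',
                                       nb_itrs=len(init_pose_list),
                                       init_pose_list=init_pose_list,
                                       map_name='empty',
                                       num_targets=1)
print("mean reward: %.2f, mean nlogdetcov: %.2f"
      % (rewards.mean(), nlogdetcov.mean()))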
def main():
    env = envs.make(args.env,
                    'target_tracking',
                    render=True,
                    directory=args.log_dir,
                    map_name=args.map,
                    num_targets=args.nb_targets,
                    is_training=False,
                    )
    # Unwrap to the core environment (one level below the TimeLimit wrapper).
    env_core = env
    while not hasattr(env_core, '_elapsed_steps'):
        env_core = env_core.env
    env_core = env_core.env

    from logger import TTENV_TEST_SET_PUB
    for eval_num in range(len(TTENV_TEST_SET_PUB)):
        print("TTENV_TEST_SET_PUB: Eval Num %d ..." % eval_num)
        init_pose = []
        target_paths = []
        map_info = []
        while len(init_pose) < args.nb_paths:  # test episode
            _, done = env.reset(**TTENV_TEST_SET_PUB[eval_num]), False
            env_core.has_discovered = [1] * args.nb_targets
            proceed = False
            if args.manual_check:
                env.render()
                proceed = ("y" == input(
                    "%d, Init Pose Pass? (y/n) " % len(init_pose)))
            if proceed or not args.manual_check:
                init_pose_k = {
                    'agent': env_core.agent.state,
                    'targets': [env_core.targets[i].state
                                for i in range(args.nb_targets)],
                    'belief_targets': [env_core.belief_targets[i].state
                                       for i in range(args.nb_targets)]
                }
                # Note: [[]] * n would alias one inner list n times; build
                # independent per-target lists instead.
                target_path_t = [[] for _ in range(args.nb_targets)]
                while not done:
                    _, _, done, _ = env.step(env.action_space.sample())
                    if args.render:
                        env.render()
                    for i in range(args.nb_targets):
                        target_path_t[i].append(env_core.targets[i].state)
                proceed = False
                if args.manual_check:
                    env.render()
                    proceed = ("y" == input(
                        "%d, Pass? (y/n) " % len(init_pose)))
                if proceed or not args.manual_check:
                    init_pose.append(init_pose_k)
                    target_paths.append(target_path_t)
                    if args.map == 'dynamic_map':
                        map_info.append({'chosen_idx': env_core.MAP.chosen_idx,
                                         'rot_angs': env_core.MAP.rot_angs})
        np.save(open(os.path.join(args.log_dir, 'path_%d.npy' % eval_num),
                     'wb'), target_paths)
        pickle.dump(init_pose,
                    open(os.path.join(args.log_dir,
                                      'init_eval_%d.pkl' % eval_num), 'wb'))
        if args.map == 'dynamic_map':
            pickle.dump(map_info,
                        open(os.path.join(args.log_dir,
                                          'map_info_%d.pkl' % eval_num), 'wb'))
def __init__(self, env_name, num, total=20, num_train=15, max_horizon=15):
    self.env_name = env_name
    self.num = num
    self.total = total
    self.num_train = num_train
    self.max_horizon = max_horizon
    self.policy = get_up_network(env_name, num)
    path = f"{env_name}_{num}"
    self.eval_env = make(env_name, num)
    self.data = self._get_data(path)
def test():
    env = envs.make(args.env,
                    render=bool(args.render),
                    record=bool(args.record))
    act = simple.load(os.path.join(args.log_dir, args.log_fname))
    if args.record:
        env = Monitor(env, directory=args.log_dir)
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if args.render:
                env.render()
                time.sleep(0.05)
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
def test(env_id,
         act,
         nb_itrs=5,
         nb_test_steps=10000,
         render=False,
         map_name=None,
         num_targets=1):
    total_rewards, total_eval = [], []
    for _ in range(nb_itrs):
        env_new = envs.make(env_id,
                            render,
                            figID=1,
                            is_training=False,
                            map_name=map_name,
                            num_targets=num_targets)
        obs = env_new.reset()
        if nb_test_steps is None:
            done_test = False
            episode_reward, episode_eval = 0, 0
            t = 0
            while not done_test:
                action = act(np.array(obs)[None])[0]
                obs, rew, done, info = env_new.step(action)
                if render:
                    env_new.render()
                episode_reward += rew
                episode_eval += info['test_reward']
                t += 1
                if done:
                    obs = env_new.reset()
                    done_test = done
            if render:
                env_new.close()
            total_rewards.append(episode_reward)
            total_eval.append(episode_eval)
        else:
            t = 0
            rewards, evals = [], []
            episode_reward, episode_eval = 0, 0
            while t < nb_test_steps:
                action = act(np.array(obs)[None])[0]
                obs, rew, done, info = env_new.step(action)
                episode_reward += rew
                episode_eval += info['test_reward']
                t += 1
                if done:
                    obs = env_new.reset()
                    rewards.append(episode_reward)
                    evals.append(episode_eval)
                    episode_reward, episode_eval = 0, 0
            # `rewards` (not the undefined name `episodes`) holds the episode
            # returns in this branch.
            if not rewards:
                rewards.append(episode_reward)
                evals.append(episode_eval)
            total_rewards.append(np.mean(rewards))
            total_eval.append(np.mean(evals))
    return (np.array(total_rewards, dtype=np.float32),
            np.array(total_eval, dtype=np.float32))
def test():
    env = make('DartHopperPT-v1', num=5)
    # Alternative: random rollout without state resets.
    # env.reset()
    # for i in tqdm.trange(10000):
    #     env.step(env.action_space.sample())
    env.reset()
    state = get_state(env)
    for i in tqdm.trange(10000):
        env.reset()
        set_state(env, state)
        # Perturb the saved state; `size=` is assumed here, since passing the
        # shape as the first (mean) argument would offset the state instead.
        state = state + np.random.normal(size=state.shape)
        env.step(env.action_space.sample())
        state = get_state(env)
def train(seed, save_dir):
    set_global_seeds(seed)
    save_dir_0 = os.path.join(save_dir, 'seed_%d' % seed)
    os.makedirs(save_dir_0)
    env = envs.make(args.env, 'classic_control')
    with tf.device(args.device):
        with tf.compat.v1.variable_scope('seed_%d' % seed):
            model = models.mlp([args.num_units] * args.num_layers,
                               init_mean=args.init_mean,
                               init_sd=args.init_sd)
            act = deepadfq.learn(
                env,
                q_func=model,
                lr=args.learning_rate,
                lr_decay_factor=args.learning_rate_decay_factor,
                lr_growth_factor=args.learning_rate_growth_factor,
                max_timesteps=args.nb_train_steps,
                buffer_size=args.buffer_size,
                batch_size=args.batch_size,
                exploration_fraction=args.eps_fraction,
                exploration_final_eps=args.eps_min,
                target_network_update_freq=args.target_update_freq,
                print_freq=args.nb_epoch_steps,
                checkpoint_freq=int(args.nb_train_steps / 5),
                learning_starts=args.nb_warmup_steps,
                gamma=args.gamma,
                prioritized_replay=bool(args.prioritized),
                prioritized_replay_alpha=args.prioritized_replay_alpha,
                callback=None,  #callback,
                alg=args.alg,
                scope=args.scope,
                sdMin=np.sqrt(args.varth),
                noise=args.noise,
                act_policy=args.act_policy,
                epoch_steps=args.nb_epoch_steps,
                eval_logger=Logger(args.env,
                                   'classic_control',
                                   save_dir=save_dir_0,
                                   render=bool(args.render)),
                save_dir=save_dir_0,
                test_eps=args.test_eps,
                gpu_memory=args.gpu_memory,
                render=bool(args.render),
            )
    if args.record == 1:
        env.moviewriter.finish()
def test():
    set_global_seeds(args.seed)
    import json
    if args.env == 'TargetTracking-v5':
        import simple_imtracking as simple
    else:
        import simple_tracking as simple

    learning_prop = json.load(
        open(os.path.join(args.log_dir, 'learning_prop.json'), 'r'))
    env = envs.make(args.env,
                    render=bool(args.render),
                    record=bool(args.record),
                    ros=bool(args.ros),
                    map_name=args.map,
                    num_targets=learning_prop['nb_targets'],
                    dirname=args.log_dir,
                    is_training=True,
                    im_size=args.im_size,
                    )
    act_params = {'scope': learning_prop['scope'], 'eps': args.test_eps}
    act = simple.load(os.path.join(args.log_dir, args.log_fname), act_params)
    if args.ros_log:
        from envs.target_tracking.ros_wrapper import RosLog
        log = RosLog(num_targets=args.nb_targets,
                     wrapped_num=args.ros + args.render + args.record + 1)
    t = 0
    while t < args.nb_test_steps:  # test episode
        t += 1
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if args.render:
                env.render()
            if args.ros_log:
                log.log(env)
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
    if args.record:
        env.moviewriter.finish()
    if args.ros_log:
        log.save(args.log_dir)
def main():
    env = envs.make(args.env,
                    'target_tracking',
                    render=bool(args.render),
                    directory=args.log_dir,
                    map_name=args.map,
                    num_targets=args.nb_targets,
                    is_training=False,
                    )
    timelimit_env = env
    while not hasattr(timelimit_env, '_elapsed_steps'):
        timelimit_env = timelimit_env.env
    init_pose = []
    params = {}
    # This is an example. Please change this if necessary.
    # from logger import TTENV_EVAL_SET
    # params = TTENV_EVAL_SET[0]
    while len(init_pose) < args.nb_init_pose:  # test episode
        _, done = env.reset(**params), False
        if args.render:
            env.render()
        notes = input("%d, Pass? y/n" % len(init_pose))
        if notes == "y":
            init_pose.append({
                'agent': timelimit_env.env.agent.state,
                'targets': [timelimit_env.env.targets[i].state
                            for i in range(args.nb_targets)],
                'belief_targets': [timelimit_env.env.belief_targets[i].state
                                   for i in range(args.nb_targets)]
            })
    pickle.dump(
        init_pose,
        open(os.path.join(args.log_dir, 'init_pose_random_1015.pkl'), 'wb'))
def main():
    env = envs.make(args.env,
                    'ma_target_tracking',
                    render=bool(args.render),
                    directory=args.log_dir,
                    map_name=args.map,
                    num_agents=args.nb_agents,
                    num_targets=args.nb_targets,
                    is_training=False,
                    )
    timelimit_env = env
    while not hasattr(timelimit_env, '_elapsed_steps'):
        timelimit_env = timelimit_env.env
    init_pose = []
    while len(init_pose) < args.nb_init_pose:  # test episode
        obs, done = env.reset(), False
        if args.render:
            env.render()
        notes = input("%d, Pass? y/n" % len(init_pose))
        if notes == "y":
            init_pose.append({
                'agents': [timelimit_env.env.agents[i].state
                           for i in range(args.nb_agents)],
                'targets': [timelimit_env.env.targets[i].state
                            for i in range(args.nb_targets)],
                'belief_targets': [timelimit_env.env.belief_targets[i].state
                                   for i in range(args.nb_targets)]
            })
    pickle.dump(
        init_pose,
        open(os.path.join(args.log_dir, 'init_pose_random_1015.pkl'), 'wb'))
def test():
    env = envs.make(args.env,
                    'classic_control',
                    render=bool(args.render),
                    record=bool(args.record),
                    directory=args.log_dir)
    learning_prop = json.load(
        open(os.path.join(args.log_dir, '../learning_prop.json'), 'r'))
    act_params = {
        'scope': "seed_%d" % learning_prop['seed'] + "/" + learning_prop['scope'],
        'eps': args.test_eps
    }
    act = deepq.load(os.path.join(args.log_dir, args.log_fname), act_params)
    while True:
        obs, done = env.reset(), False
        episode_rew = 0
        while not done:
            if args.render:
                env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            episode_rew += rew
        print("Episode reward", episode_rew)
def __init__(self, scene, discount, initQ, TH, memory_size):
    """Tabular RL agent.

    Parameters
    ----------
    scene : str
        Name of the task to test (see models.py).
    discount : float
        Discount factor of the MDP.
    initQ : float or None
        Initial Q value (all Q values are initialized to this number).
        If None, init_params() is used instead.
    TH : int
        Finite time horizon (maximum number of learning steps).
    memory_size : int
        Size of the experience replay memory.
    """
    self.env = envs.make(scene)
    self.discount = discount
    self.states, self.actions, self.rewards = [], [], []
    self.visits = np.zeros((self.env.snum, self.env.anum), dtype=int)
    self.np_random, _ = seeding.np_random(None)
    self.test_counts = []
    self.test_rewards = []
    self.dim = (self.env.snum, self.env.anum)
    if initQ is None:
        self.init_params()
    else:
        self.Q = initQ * np.ones(self.dim, dtype=float)
        if hasattr(self.env, 'terminal_states'):
            for ts in self.env.terminal_states:
                self.Q[ts, :] = 0.0
    self.Q_err = []
    self.Q_target = np.array(self.env.optQ(self.discount)).astype(np.float16)
    self.memory_size = memory_size
    self.replayMem = {(i, j): [] for i in range(self.env.snum)
                      for j in range(self.env.anum)}
    if TH is not None:
        self.env.set_time(TH)
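# A minimal sketch (assumed, not part of the original class) of epsilon-greedy
# action selection on top of the tabular Q-table above. It uses the seeded
# `self.np_random` generator so runs stay reproducible.
def get_action_egreedy(self, state, epsilon):
    if self.np_random.uniform() < epsilon:
        return int(self.np_random.choice(self.env.anum))  # explore
    return int(np.argmax(self.Q[state]))  # exploit: greedy w.r.t. current Q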
def evaluation(act,
               env_id,
               env_type,
               nb_test_steps=None,
               nb_itrs=5,
               render=False,
               **kwargs):
    """Evaluate the current model with a semi-greedy action policy.

    Parameters
    ----------
    act: ActWrapper
        Wrapper over the act function; the action policy for the evaluation.
    env_id: str
        Name of an environment (e.g. CartPole-v0).
    env_type: str
        Type of an environment (e.g. 'atari', 'classic_control',
        'target_tracking').
    nb_test_steps: int
        Number of steps for the evaluation at each iteration.
        If None, it evaluates until an episode ends.
    nb_itrs: int
        Number of test iterations.
    render: bool
        Display if True.

    Returns
    -------
    total_rewards: np.array with shape=(nb_itrs,)
        Cumulative rewards.
    total_nlogdetcov: np.array with shape=(nb_itrs,)
        Cumulative negative mean of logdetcov, only for a target tracking env.
    """
    total_rewards = []
    env = envs.make(env_id, env_type, render=render, is_training=False,
                    **kwargs)
    for _ in range(nb_itrs):
        obs = env.reset()
        if nb_test_steps is None:  # Evaluate until an episode ends.
            done = False
            episode_reward, t = 0, 0
            while not done:
                if render:
                    env.render()
                action = act(np.array(obs)[None])[0]
                obs, rew, done, info = env.step(action)
                episode_reward += rew
                t += 1
                if done and (env_type == 'atari') and (info['ale.lives'] != 0):
                    done = False
            total_rewards.append(episode_reward)
        else:
            t, episode_reward = 0, 0
            episodes = []
            while t < nb_test_steps:
                if render:
                    env.render()
                action = act(np.array(obs)[None])[0]
                obs, rew, done, info = env.step(action)
                episode_reward += rew
                t += 1
                if done:
                    obs = env.reset()
                    if ((env_type == 'atari') and (info['ale.lives'] == 0)) \
                            or (env_type != 'atari'):
                        episodes.append(episode_reward)
                        episode_reward = 0
            if not episodes:
                episodes.append(episode_reward)
            total_rewards.append(np.mean(episodes))
    if render:
        env.close()
    return np.array(total_rewards, dtype=np.float32), None
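# Usage sketch for `evaluation`, assuming a policy loaded with the repo's
# deepq.load as in the test scripts above; the model path and scope values
# are illustrative.
act = deepq.load('results/seed_0/model.pkl',
                 {'scope': 'seed_0/deepq', 'eps': 0.05})
rewards, _ = evaluation(act,
                        'CartPole-v0',
                        'classic_control',
                        nb_test_steps=None,
                        nb_itrs=5,
                        render=False)
print("mean episode reward over %d runs: %.2f"
      % (len(rewards), rewards.mean()))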
def train(seed, save_dir):
    set_global_seeds(seed)
    save_dir_0 = os.path.join(save_dir, 'seed_%d' % seed)
    os.makedirs(save_dir_0)
    env = envs.make(args.env,
                    'target_tracking',
                    render=bool(args.render),
                    record=bool(args.record),
                    directory=save_dir_0,
                    ros=bool(args.ros),
                    map_name=args.map,
                    num_targets=args.nb_targets,
                    im_size=args.im_size,
                    )
    with tf.device(args.device):
        with tf.compat.v1.variable_scope('seed_%d' % seed):
            hiddens = [int(h) for h in args.hiddens.split(':')]
            if args.env == 'TargetTracking-v5':
                model = models.cnn_plus_mlp(
                    convs=[(4, 8, 4), (8, 4, 2)],
                    hiddens=hiddens,
                    dueling=bool(args.dueling),
                    init_mean=args.init_mean,
                    init_sd=args.init_sd,
                    inpt_dim=(args.im_size, args.im_size),
                )
            else:
                model = models.mlp(hiddens,
                                   init_mean=args.init_mean,
                                   init_sd=args.init_sd)
            act = deepadfq.learn(
                env,
                q_func=model,
                lr=args.learning_rate,
                lr_decay_factor=args.learning_rate_decay_factor,
                lr_growth_factor=args.learning_rate_growth_factor,
                max_timesteps=args.nb_train_steps,
                buffer_size=args.buffer_size,
                batch_size=args.batch_size,
                exploration_fraction=args.eps_fraction,
                exploration_final_eps=args.eps_min,
                target_network_update_freq=args.target_update_freq,
                checkpoint_freq=args.checkpoint_freq,
                learning_starts=args.nb_warmup_steps,
                gamma=args.gamma,
                prioritized_replay=bool(args.prioritized),
                prioritized_replay_alpha=args.prioritized_replay_alpha,
                callback=None,  #callback,
                alg=args.alg,
                scope=args.scope,
                sdMin=np.sqrt(args.varth),
                noise=args.noise,
                act_policy=args.act_policy,
                epoch_steps=args.nb_epoch_steps,
                eval_logger=Logger(args.env,
                                   env_type='target_tracking',
                                   save_dir=save_dir_0,
                                   render=bool(args.render),
                                   figID=1,
                                   ros=bool(args.ros),
                                   map_name=args.map,
                                   num_targets=args.nb_targets,
                                   im_size=args.im_size,
                                   eval_type=args.eval_type,
                                   init_file_path=args.init_file_path,
                                   ),
                save_dir=save_dir_0,
                test_eps=args.test_eps,
                gpu_memory=args.gpu_memory,
                render=(bool(args.render) or bool(args.ros)),
            )
            print("Saving model to model.pkl")
            act.save(os.path.join(save_dir_0, "model.pkl"))
    if args.record == 1:
        env.moviewriter.finish()
def test():
    learning_prop = json.load(
        open(os.path.join(args.log_dir, '../learning_prop.json'), 'r'))
    env = envs.make(args.env,
                    'target_tracking',
                    render=bool(args.render),
                    record=bool(args.record),
                    directory=args.log_dir,
                    ros=bool(args.ros),
                    map_name=args.map,
                    num_targets=learning_prop['nb_targets'],
                    im_size=learning_prop['im_size'],
                    is_training=False,
                    )
    timelimit_env = env
    while not hasattr(timelimit_env, '_elapsed_steps'):
        timelimit_env = timelimit_env.env
    act_params = {
        'scope': "seed_%d" % learning_prop['seed'] + "/" + learning_prop['scope'],
        'eps': args.test_eps
    }
    act = deepadfq.load(os.path.join(args.log_dir, args.log_fname), act_params)
    if args.ros_log:
        from envs.target_tracking.ros_wrapper import RosLog
        ros_log = RosLog(num_targets=args.nb_targets,
                         wrapped_num=args.ros + args.render + args.record + 1)

    ep = 0
    ep_nlogdetcov = ['Episode nLogDetCov']
    time_elapsed = ['Elapsed Time (sec)']
    given_init_pose, test_init_pose = [], []
    # Use a fixed set of initial positions if given.
    if args.init_file_path != '.':
        import pickle
        given_init_pose = pickle.load(open(args.init_file_path, "rb"))

    while ep < args.nb_test_steps:  # test episode
        ep += 1
        episode_rew, nlogdetcov = 0, 0
        obs, done = env.reset(init_pose_list=given_init_pose), False
        test_init_pose.append({
            'agent': timelimit_env.env.agent.state,
            'targets': [timelimit_env.env.targets[i].state
                        for i in range(args.nb_targets)],
            'belief_targets': [timelimit_env.env.belief_targets[i].state
                               for i in range(args.nb_targets)]
        })
        s_time = time.time()
        while not done:
            if args.render:
                env.render()
            if args.ros_log:
                ros_log.log(env)
            obs, rew, done, info = env.step(act(obs[None])[0])
            episode_rew += rew
            nlogdetcov += info['mean_nlogdetcov']
        time_elapsed.append(time.time() - s_time)
        ep_nlogdetcov.append(nlogdetcov)
        print("Ep.%d - Episode reward : %.2f, Episode nLogDetCov : %.2f"
              % (ep, episode_rew, nlogdetcov))

    if args.record:
        env.moviewriter.finish()
    if args.ros_log:
        ros_log.save(args.log_dir)

    import pickle, tabulate
    pickle.dump(test_init_pose,
                open(os.path.join(args.log_dir, 'test_init_pose.pkl'), 'wb'))
    f_result = open(os.path.join(args.log_dir, 'test_result.txt'), 'w')
    f_result.write(tabulate.tabulate([ep_nlogdetcov, time_elapsed],
                                     tablefmt='presto'))
    f_result.close()