def optimize_params(self, trial, n_prune_evals_per_trial: int = 2, n_tests_per_eval: int = 1):
    train_provider, test_provider = self.data_provider.split_data_train_test(self.train_split_percentage)
    train_provider, validation_provider = train_provider.split_data_train_test(self.train_split_percentage)

    del test_provider

    train_env = SubprocVecEnv([make_env(train_provider, i) for i in range(1)])
    validation_env = SubprocVecEnv([make_env(validation_provider, i) for i in range(1)])

    model_params = self.optimize_agent_params(trial)
    model = self.Model(self.Policy,
                       train_env,
                       verbose=self.model_verbose,
                       nminibatches=1,
                       tensorboard_log=self.tensorboard_path,
                       **model_params)

    last_reward = -np.finfo(np.float16).max
    n_steps_per_eval = int(len(train_provider.data_frame) / n_prune_evals_per_trial)

    for eval_idx in range(n_prune_evals_per_trial):
        try:
            model.learn(n_steps_per_eval)
        except AssertionError:
            raise

        rewards = []
        n_episodes, reward_sum = 0, 0.0

        trades = train_env.get_attr('trades')

        if len(trades[0]) < 1:
            self.logger.info('Pruning trial for not making any trades: %s', eval_idx)
            raise optuna.structs.TrialPruned()

        state = None
        obs = validation_env.reset()
        while n_episodes < n_tests_per_eval:
            action, state = model.predict(obs, state=state)
            obs, reward, done, _ = validation_env.step([action])

            reward_sum += reward[0]

            if all(done):
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = validation_env.reset()

        last_reward = np.mean(rewards)
        trial.report(-1 * last_reward, eval_idx)

        if trial.should_prune(eval_idx):
            raise optuna.structs.TrialPruned()

    return -1 * last_reward
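# The make_env(provider, i) factory used above is not shown in this excerpt.
# SubprocVecEnv expects a list of zero-argument callables, each returning a fresh
# environment instance. A minimal sketch of such a factory, assuming a hypothetical
# TradingEnv class that wraps a data provider (names here are assumptions, not
# taken from the code above):
def make_env(data_provider, rank, seed=0):
    def _init():
        env = TradingEnv(data_provider=data_provider)  # hypothetical env class
        env.seed(seed + rank)                          # offset the seed per worker
        return env
    return _init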
def evaluate_model_on_set(
    set_path,
    model,
    config_path=None,
    config_kw=None,
    metrics=("success", "control_variation", "rise_time", "overshoot", "settling_time"),
    norm_data_path=None,
    num_envs=1,
    turbulence_intensity="none",
    use_pid=False,
    writer=None,
    timestep=None,
):
    """
    :param set_path: (str) path to test set file
    :param model: (PPO2 object or [PIDController]) the controller to be evaluated
    :param config_path: (str) path to gym environment configuration file
    :param config_kw: (dict) dictionary of key value pairs to override settings in the configuration file of the gym environment
    :param metrics: ([str]) list of metrics to be computed and recorded
    :param norm_data_path: (str) path to folder containing normalization statistics
    :param num_envs: (int) number of gym environments to run in parallel using multiprocessing
    :param turbulence_intensity: (str) the intensity setting of the wind turbulence
    :param use_pid: (bool) whether the evaluated controller is a PID controller or not
    :param writer: (tensorboard writer) if supplied, evaluation results will be written to the tensorboard log; if not, results are printed to standard output
    :param timestep: (int) what timestep results are written to when using tensorboard logging
    :return: (dict) the metrics computed for the evaluated controller on the test set
    """
    scenarios = list(np.load(set_path, allow_pickle=True))
    scenario_count = len(scenarios)

    if config_kw is None:
        config_kw = {}

    config_kw.update({
        "steps_max": 1500,
        "target": {
            "on_success": "done",
            "success_streak_fraction": 1,
            "success_streak_req": 100,
            "states": {0: {"bound": 5}, 1: {"bound": 5}, 2: {"bound": 2}},
        },
    })

    if use_pid:
        config_kw["action"] = {"scale_space": False}

    sim_config_kw = {
        "turbulence": turbulence_intensity != "None",
        "turbulence_intensity": turbulence_intensity,
    }

    test_env = SubprocVecEnv([
        make_env(config_path, i, config_kw=config_kw, sim_config_kw=sim_config_kw)
        for i in range(num_envs)
    ])

    if use_pid:
        dt = test_env.get_attr("simulator")[0].dt
        for pid in model:
            pid.dt = dt
        env_cfg = test_env.get_attr("cfg")[0]
        obs_states = [var["name"] for var in env_cfg["observation"]["states"]]
        try:
            phi_i, theta_i, Va_i = (
                obs_states.index("roll"),
                obs_states.index("pitch"),
                obs_states.index("Va"),
            )
            omega_i = [
                obs_states.index("omega_p"),
                obs_states.index("omega_q"),
                obs_states.index("omega_r"),
            ]
        except ValueError:
            print("When using PID, roll, pitch, Va, omega_p, omega_q and omega_r must be part of the observation vector.")
    else:
        test_env = VecNormalize(test_env)
        if model.env is not None:
            test_env.obs_rms = model.env.obs_rms
            test_env.ret_rms = model.env.ret_rms
        else:
            assert norm_data_path is not None
            test_env.load_running_average(norm_data_path)
        test_env.training = False

    res = {metric: {} for metric in metrics}
    res["rewards"] = [[] for i in range(scenario_count)]

    active_envs = [i < scenario_count for i in range(num_envs)]
    env_scen_i = [i for i in range(num_envs)]
    test_done = False
    obs = np.array([np.zeros(test_env.observation_space.shape) for i in range(num_envs)])
    done = [True for i in range(num_envs)]
    info = None

    while not test_done:
        for i, env_done in enumerate(done):
            if env_done:
                if len(scenarios) > 0 or active_envs[i]:
                    if len(scenarios) > 0:
                        print("{}/{} scenarios left".format(len(scenarios), scenario_count))
                        scenario = scenarios.pop(0)
                        env_scen_i[i] = (scenario_count - 1) - len(scenarios)
                        obs[i] = test_env.env_method("reset", indices=i, **scenario)[0]
                        if use_pid:
                            model[i].reset()
                            model[i].set_reference(
                                scenario["target"]["roll"],
                                scenario["target"]["pitch"],
                                scenario["target"]["Va"],
                            )
                    else:
                        active_envs[i] = False
                    if info is not None:
                        for metric in metrics:
                            if isinstance(info[i][metric], dict):
                                for state, value in info[i][metric].items():
                                    if state not in res[metric]:
                                        res[metric][state] = []
                                    res[metric][state].append(value)
                            else:
                                if "all" not in res[metric]:
                                    res[metric]["all"] = []
                                res[metric]["all"].append(info[i][metric])

        if len(scenarios) == 0:
            test_done = not any(active_envs)

        if use_pid:
            actions = []
            for i, pid in enumerate(model):
                roll, pitch, Va = obs[i, phi_i], obs[i, theta_i], obs[i, Va_i]
                omega = obs[i, omega_i]
                if info is not None and "target" in info[i]:
                    pid.set_reference(
                        phi=info[i]["target"]["roll"],
                        theta=info[i]["target"]["pitch"],
                        va=info[i]["target"]["Va"],
                    )
                actions.append(pid.get_action(roll, pitch, Va, omega))
            actions = np.array(actions)
        else:
            actions, _ = model.predict(obs, deterministic=True)

        obs, rew, done, info = test_env.step(actions)
        for i, env_rew in enumerate(rew):
            res["rewards"][env_scen_i[i]].append(env_rew)

    if writer is not None:
        summaries = []
        for metric, metric_v in res.items():
            if isinstance(res[metric], dict):
                for state, v in res[metric].items():
                    summaries.append(
                        tf.Summary.Value(
                            tag="test_set/{}_{}".format(metric, state),
                            simple_value=np.nanmean(v),
                        ))
        writer.add_summary(tf.Summary(value=summaries), timestep)
    else:
        print_results(res)

    return res
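# A hedged usage sketch of evaluate_model_on_set: file paths, the turbulence
# setting, and the env count below are placeholders, not values from the
# original project. Because the model is loaded without an env, normalization
# statistics are supplied via norm_data_path. With no writer passed, the
# function prints its results to standard output and returns the metrics dict.
from stable_baselines import PPO2

model = PPO2.load("models/attitude_controller.pkl")       # placeholder path
results = evaluate_model_on_set(
    "test_sets/test_set_wind_moderate.npy",               # placeholder path
    model,
    config_path="config_attitude.json",                   # placeholder path
    norm_data_path="norm_stats/",                         # placeholder path
    num_envs=4,
    turbulence_intensity="moderate",
)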
def main():
    args = get_configuration()
    args.state_dim = util.get_state_dim(args)

    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir, exist_ok=True)

    if args.graph_embedding:
        class MyPolicy(EmbeddingPolicy):
            def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=True, **_kwargs):
                super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, args, reuse=reuse, **_kwargs)
    else:
        class MyPolicy(EnigmaPolicy):
            def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=True, **_kwargs):
                super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, args, reuse=reuse, **_kwargs)

    t0 = time.time()

    from mpi4py import MPI as mpi
    comm = mpi.COMM_WORLD
    rank = comm.Get_rank()
    world_size = comm.Get_size()

    gpus = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
    gpu_count = len(gpus)
    gpu = gpus[rank % gpu_count]
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
    print("My rank is {} out of {}, using GPU {}".format(rank, world_size, gpu))

    if args.model_type == "ppo2":
        from stable_baselines import PPO2 as PPO
        env = SubprocVecEnv([(lambda: ProofEnv.ProofEnv(args))
                             for _ in range(args.parallel_envs)])  # , start_method="spawn")
    elif args.model_type == "ppo1":
        args.parallel_envs = 1
        env = DummyVecEnv([lambda: ProofEnv.ProofEnv(args)])
        # from stable_baselines import PPO1 as PPO
        from ppo import PPO1 as PPO

    if args.saved_model is None:
        myPolicy = MyPolicy
        if args.model_type == "ppo2":
            model = PPO(policy=myPolicy,
                        env=env,
                        n_steps=args.actorbatch,
                        # nminibatches=args.optim_stepsize,
                        lam=0.95,
                        gamma=args.gamma,
                        noptepochs=4,
                        ent_coef=args.entcoeff,
                        learning_rate=lambda f: f * 2.5e-4,
                        cliprange=lambda f: f * 0.1,
                        verbose=1)
        elif args.model_type == "ppo1":
            model = PPO(myPolicy, env,
                        verbose=2,
                        timesteps_per_actorbatch=args.actorbatch,
                        schedule=args.lr_schedule,
                        optim_stepsize=args.optim_stepsize,
                        entcoeff=args.entcoeff,
                        optim_batchsize=args.optim_batchsize,
                        gamma=args.gamma)
    else:
        print("Loading model from {}".format(args.saved_model))
        model = PPO.load(args.saved_model)
        model.set_env(env)

    counter = 0
    for ind in range(args.parallel_envs):
        env.env_method("set_model", model, indices=list(range(args.parallel_envs)))

    modelfiles = []
    for train_timestep, train_dir in zip(args.train_timesteps, args.train_dirs):
        problem_files = sorted(util.list_problems(train_dir))
        problem_files = util.split_list(problem_files, world_size)[rank]
        problem_files_splitted = util.split_list(problem_files, args.parallel_envs, extensible=False)

        if args.add_repeating_pretraining:
            for ind in range(args.parallel_envs):
                env.env_method("set_source", problem_files_splitted[ind],
                               indices=[ind], generator_type="repeating")
            # all_thread_timestep = train_timestep * world_size
            print("PRETRAINING")
            model.learn(total_timesteps=train_timestep)
            print("Pretraining on {} finished in {}".format(train_dir, util.format_time(time.time() - t0)))

        for ind in range(args.parallel_envs):
            env.env_method("set_source", problem_files_splitted[ind], indices=[ind])
        # all_thread_timestep = train_timestep * world_size
        model.learn(total_timesteps=train_timestep)

        modelfile = "{}/ppo1_fcop_train_{}".format(args.outdir, counter)
        modelfiles.append(modelfile)
        if rank == 0:
            model.save(modelfile)
        # logger.logkv("finished_train_problems", counter)
        counter += 1
        print("Training on {} finished in {}".format(train_dir, util.format_time(time.time() - t0)))

    statistics_list = env.get_attr("statistics", indices=list(range(args.parallel_envs)))
    blacklist_list = env.get_attr("blacklist", indices=list(range(args.parallel_envs)))
    for i, statistics in enumerate(statistics_list):
        print("ENV {} - {} - blacklist: {}\n".format(rank, i, blacklist_list[i]))
        util.print_problemdict(statistics, rank)
        # for f in statistics:
        #     statistics[f]["mcts"].display_tree([0])

    # util.print_problemdict(env.envs[0].statistics)

    if len(args.train_dirs) > 0 and len(args.train_timesteps) > 0:  # we did training
        print("We have finished training, rank {}".format(rank))
        # for p in problem_files:
        #     vis_policy.vis_policy(env.envs[0], model, p)
        env.close()
        del env
        del model

    # here we wait for everyone
    comm.Barrier()

    print("We have started evaluation, rank {}".format(rank))

    # evaluation without training
    if (args.saved_model is not None) and (len(args.train_dirs) == 0):  # no training, just evaluation
        modelfiles = [args.saved_model]

    for evaldir in args.evaldirs:
        for model_index, modelfile in enumerate(modelfiles):
            eval.eval_mpi(args, evaldir, modelfile, model_index)
            # here we wait for everyone
            comm.Barrier()
class Agent:

    def __init__(self, version, envs, hours=0, verbose=False, weights=None):
        self.version = version
        self.name = "football-ppo{}".format(version) + "-e{}"
        self.path = "models/football-ppo-{}/".format(version)

        self.defaults = {
            "env_name": "",
            "representation": "simple115",
            "rewards": "scoring",
            "render": False,
            "write_video": False,
            "dump_frequency": 1,
            "extra_players": None,
            "number_of_left_players_agent_controls": 1,
            "number_of_right_players_agent_controls": 0,
            "enable_sides_swap": False,
            "parallel": 1
        }

        self.configs = list(map(
            lambda b: dict(map(lambda a: (a[0], a[1] if a[0] not in b.keys() else b[a[0]]), self.defaults.items())),
            envs
        ))

        self.training = SubprocVecEnv(reduce(lambda a, b: a + b, list(map(lambda config: [
            lambda: football.create_environment(
                env_name=config["env_name"],
                representation=config["representation"],
                rewards=config["rewards"],
                render=config["render"],
                write_video=config["write_video"],
                dump_frequency=config["dump_frequency"],
                extra_players=config["extra_players"],
                number_of_left_players_agent_controls=config["number_of_left_players_agent_controls"],
                number_of_right_players_agent_controls=config["number_of_right_players_agent_controls"],
                enable_sides_swap=config["enable_sides_swap"]
            ) for _ in range(config["parallel"])
        ], self.configs)), []))

        self.inputs = self.training.get_attr("observation_space")[0].shape[0]
        self.outputs = self.training.get_attr("action_space")[0].n

        self.verbose = verbose
        if not verbose:
            os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
            deprecation._PRINT_DEPRECATION_WARNINGS = False
            logger = logging.getLogger()
            logger.setLevel(logging.ERROR)

        if weights is None:
            self.model = PPO2(policy=MlpPolicy, env=self.training, verbose=int(self.verbose))
        else:
            self.model = PPO2.load(weights, env=self.training, learning_rate=0.002)

        self.experience = hours * 60

    def duration(self, time):
        return "{:02}:{:02}".format(time // 60, time % 60)

    def progress(self, current, total):
        return "{}{}".format("#" * int(current / (total / 10)), " " * (10 - int(current / (total / 10))))

    def blank(self):
        return [("", "")]

    def separator(self, width=36, inset=" "):
        return [inset + ("-" * width)]

    def section(self, *, values, width=36, inset=" "):
        rows = []
        reserved = 5
        space = lambda a: len(str(a[0])) + len(str(a[1]))
        lengths = list(map(lambda a: space(a), values))
        width = max(lengths + [width - reserved])

        for (name, value), length in zip(values, lengths):
            rows.append(inset + "| " + name + (" " * (width - length)) + " " + str(value) + " |")

        return rows

    def dump(self, lines):
        with open(os.path.join(self.path, "results.txt"), "a+") as dump:
            for line in lines:
                dump.write(line + ("\r\n" if line != "\r\n" else ""))

    def train(self, *, epoch, episodes, verbose):
        inset = " "
        start = datetime.datetime.now()
        counts = list(map(lambda a: a["parallel"], self.configs))
        stochastics = ["11_vs_11_stochastic", "11_vs_11_easy_stochastic", "11_vs_11_hard_stochastic"]
        expand = lambda values, counts: reduce(lambda a, b: a + b, map(lambda c: [c[0]] * c[1], zip(values, counts)), [])
        results = Results(indexes=expand(list(map(lambda a: a["env_name"] in stochastics, self.configs)), counts))
        parallel = sum(counts)

        self.model.set_env(self.training)

        with output(initial_len=4 if not verbose else 20 + results.count, interval=0) as lines:
            lines[0] = "\n"
            lines[3] = "\n"
            lines[1] = "{}Epoch {}".format(inset, epoch)

            def callback(a, b):
                matches = self.training.get_attr("last_observation")
                results.temps(matches)
                update(
                    clock=int((3000 - matches[0][0]["steps_left"]) * 1.8),
                    scores=list(map(lambda score: "{}:{}".format(score[0], score[1]), results.scores(matches)))
                )

            def update(*, clock, scores=None):
                if not verbose:
                    return
                if scores is None:
                    scores = ["0:0"] * results.count

                matches = list(map(lambda a: "Match {}".format(a), range(1, results.count + 1)))
                table = reduce(lambda a, b: a + b, [
                    self.separator(),
                    self.section(values=(results.results() + self.blank() + results.goals() + self.blank() + [
                        ("Time", self.duration((datetime.datetime.now() - start).seconds)),
                        ("Experience", self.duration(self.experience + int((clock / 60) * parallel))),
                        ("Match Clock", self.duration(clock))
                    ])),
                    self.separator(),
                    self.section(values=list(zip(matches, scores))),
                    self.separator()
                ], [])

                for index, row in enumerate(table):
                    lines[4 + index] = row

            for episode in range(1, episodes + 1):
                lines[2] = "{}Episode {} of {} - [{}]".format(inset, episode, episodes, self.progress(episode, episodes))
                update(clock=0)

                self.model.learn(total_timesteps=3000 * parallel, callback=callback)

                matches = self.training.get_attr("last_observation")
                results.record(matches=matches)
                update(clock=5400, scores=list(map(lambda a: "{}:{}".format(a[0], a[1]), results.scores(matches))))

                self.experience += parallel * 90
                time.sleep(1)

            self.dump(lines)

    def watch(self, *, env, matches, weights, record):
        environment = SubprocVecEnv([
            lambda: football.create_environment(
                env_name="11_vs_11_easy_stochastic",
                representation=self.configs[0]["representation"],
                rewards=self.configs[0]["rewards"],
                enable_goal_videos=False,
                enable_full_episode_videos=True,
                render=True,
                write_video=record,
                dump_frequency=1,
                logdir="/home/charlie/Projects/Python/Football/videos/",
                extra_players=self.configs[0]["extra_players"],
                number_of_left_players_agent_controls=self.configs[0]["number_of_left_players_agent_controls"],
                number_of_right_players_agent_controls=self.configs[0]["number_of_right_players_agent_controls"],
                enable_sides_swap=self.configs[0]["enable_sides_swap"]
            ) for _ in range(1)
        ])

        # self.model.set_env(environment)
        watch = PPO2.load(weights, env=environment)

        for match in range(matches):
            watch.learn(total_timesteps=3100)

    def run(self, *, epochs, episodes, verbose=True):
        if os.path.exists(self.path):
            if len(os.listdir(self.path)) > 0:
                print("Directory: {} is not empty. Please make sure you are not overwriting existing models and try again.".format(self.path))
                return
        else:
            os.mkdir(self.path)

        for epoch in range(1, epochs):
            self.train(epoch=epoch, episodes=episodes, verbose=verbose)
            self.model.save(os.path.join(self.path, self.name.format(epoch)))
            self.watch(env="11_vs_11_stochastic", matches=1, weights=os.path.join(self.path, self.name.format(epoch)), record=True)
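# A hedged usage sketch of the Agent class above. The scenario names follow
# Google Research Football conventions; the version string, parallel counts and
# epoch/episode numbers are placeholders, not values from the original project.
agent = Agent(
    version="v1",
    envs=[
        {"env_name": "academy_empty_goal_close", "parallel": 4},
        {"env_name": "11_vs_11_easy_stochastic", "parallel": 2},
    ],
    verbose=True,
)
agent.run(epochs=10, episodes=20)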
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    configure_logger(log_dir)

    set_global_seeds(args.seed)

    n_cpu = get_num_workers(args.env) if not args.play else 1
    env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential,
                                args.reward_type, args.n_object, args.curriculum)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env, rank=rank, log_dir=log_dir,
                                flatten_dict=True, kwargs=env_kwargs)

    env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)])

    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    if "use_cu" in eval_env_kwargs:
        eval_env_kwargs['use_cu'] = False
    eval_env = make_env(env_id=args.env, rank=0, flatten_dict=True, kwargs=eval_env_kwargs)
    print(eval_env)

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)
        train_kwargs = get_train_kwargs("ppo", args, parsed_action_noise=None, eval_env=eval_env)

        # policy = 'MlpPolicy'
        from utils.attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("ppo", args)
        print(policy_kwargs)

        model = PPO2(args.policy, env, verbose=1, nminibatches=32, lam=0.95,
                     noptepochs=10, ent_coef=0.01, learning_rate=3e-4, cliprange=0.2,
                     policy_kwargs=policy_kwargs, **train_kwargs)
        print(model.get_parameter_list())

        def callback(_locals, _globals):
            num_update = _locals["update"]
            if 'FetchStack' in args.env:
                mean_eval_reward = stack_eval_model(eval_env, _locals["self"])
            else:
                mean_eval_reward = eval_model(eval_env, _locals["self"])
            log_eval(num_update, mean_eval_reward)
            if num_update % 10 == 0:
                model_path = os.path.join(log_dir, 'model_' + str(num_update // 10))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        model.learn(total_timesteps=int(args.num_timesteps), callback=callback, seed=args.seed, log_interval=1)
        model.save(os.path.join(log_dir, 'final'))

    else:
        assert args.load_path is not None
        model = PPO2.load(args.load_path)

        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        obs = env.reset()
        goal_dim = env.get_attr('goal')[0].shape[0]
        if 'FetchStack' in args.env:
            while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                    env.get_attr('task_mode')[0] != 1:
                obs = env.reset()
        elif 'FetchPush' in args.env:
            while not (1.25 < obs[0][6] < 1.33 and obs[0][7] < 0.61 and 0.7 < obs[0][4] < 0.8):
                obs = env.reset()
            env.env_method('set_goal', np.array([1.2, 0.75, 0.425, 1, 0]))
            obs = env.env_method('get_obs')
            obs[0] = np.concatenate([obs[0][key] for key in ['observation', 'achieved_goal', 'desired_goal']])
        else:
            while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                obs = env.reset()
        print('achieved_goal', obs[0][-2 * goal_dim:-goal_dim], 'goal', obs[0][-goal_dim:])

        episode_reward = 0.0
        num_episode = 0
        frame_idx = 0
        images = []
        if 'max_episode_steps' not in env_kwargs.keys():
            env_kwargs['max_episode_steps'] = 100

        for i in range(env_kwargs['max_episode_steps'] * 10):
            img = env.render(mode='rgb_array')
            ax.cla()
            ax.imshow(img)
            if env.get_attr('goal')[0].shape[0] <= 3:
                ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx))
            else:
                ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx)
                             + ', goal idx ' + str(np.argmax(env.get_attr('goal')[0][3:])))
            if 'FetchStack' in args.env:
                tasks = ['pick and place', 'stack']
                ax.set_title('episode ' + str(num_episode) + ', frame ' + str(frame_idx)
                             + ', task: ' + tasks[np.argmax(obs[0][-2 * goal_dim - 2:-2 * goal_dim])])
            images.append(img)
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            frame_idx += 1
            if not args.export_video:
                plt.pause(0.1)
            else:
                plt.imsave(os.path.join(os.path.dirname(args.load_path), 'tempimg%d.png' % i), img)
            if done:
                print('episode_reward', episode_reward)
                if 'FetchStack' in args.env:
                    while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                            env.get_attr('task_mode')[0] != 1:
                        obs = env.reset()
                else:
                    while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                        obs = env.reset()
                print('goal', obs[0][-goal_dim:])
                episode_reward = 0.0
                frame_idx = 0
                num_episode += 1
                if num_episode >= 10:
                    break

        if args.export_video:
            os.system('ffmpeg -r 5 -start_number 0 -i ' + os.path.dirname(args.load_path)
                      + '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p '
                      + os.path.join(os.path.dirname(args.load_path), args.env + '.mp4'))
            for i in range(env_kwargs['max_episode_steps'] * 10):
                try:
                    os.remove(os.path.join(os.path.dirname(args.load_path), 'tempimg' + str(i) + '.png'))
                except OSError:
                    pass
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
results = []
config["max_speed"] = 5
config["max_obs_range"] = 3
n_cpu = 32

model = PPO2.load("./obs_range/ppo2_default_{}.zip".format(config["max_obs_range"]))
# icies=["logs/best_model.zip"], **config) for _ in range(n_cpu)])
subpolicies = ["obs_range/ppo2_default_{}".format(config["max_obs_range"])]
env = SubprocVecEnv([lambda: NavigationEnvDefault(**config) for _ in range(n_cpu)])
# model = PPO2(policy="MlpPolicy", env=env)

scores = []
obs = env.reset()
for j in range(10000):
    actions, _ = model.predict(obs)
    obs, reward, done, info = env.step(actions)
    if j % 100 == 0:
        print(j, "/", 10000)

for i in range(n_cpu):
    scores = scores + env.get_attr("last_score", i)

with open("./obs_range/scores.csv", "a") as f:
    f.writelines("local" + "," + str(np.mean(scores)) + "\n")

env.close()
del env
episode_reward = 0
while True:
    # env.set_attr("keyboard_u", keyboard_u)
    env.render()
    action, _states = model.predict(obs, deterministic=True)
    action[0] = 0
    obs, rewards, dones, info = env.step(action)
    episode_reward += rewards[0]
    if dones[0]:
        performance[cnt, 0] = episode_reward
        episode_reward = 0
        performance[cnt, 1] = env.get_attr("record_count")[0]
        # print(env.get_attr("record_count"))
        performance[cnt, 2] = env.env_method("why_done")[0]
        # print(env.env_method("why_done"))
        if int(performance[cnt, 2]) != 0:
            performance[cnt, 1] = np.inf
        cnt += 1
        break

print(performance)
print(np.mean(performance[:, 0]),
      np.min(performance[:, 1]) * 0.1,
      np.max(performance[:, 1]) * 0.1,
      np.mean(performance[:, 1]) * 0.1,
      len(performance[performance[:, 2] == 0]),
      len(performance[performance[:, 2] == 1]))
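# The evaluation loop above begins mid-script: env, model, obs, performance and
# cnt must already exist, and the while True / break pattern suggests the loop
# is wrapped in an outer loop over evaluation episodes. A minimal sketch of the
# assumed setup follows; the env class, array size and model path are
# assumptions, not taken from the original, and the custom env is expected to
# expose a record_count attribute and a why_done() method as used above.
import numpy as np
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

n_eval_episodes = 50
performance = np.zeros((n_eval_episodes, 3))   # [episode reward, record_count, why_done code]
cnt = 0

env = DummyVecEnv([lambda: MyCustomEnv()])     # hypothetical env class
model = PPO2.load("trained_model.zip", env=env)  # placeholder path
obs = env.reset()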