def train(env_id, num_timesteps, seed, policy, attack=False, n_envs=8, nminibatches=4, n_steps=128):
    model = PPO2.load("model.pkl")
    env = VecFrameStack(make_atari_env(env_id, n_envs, seed), 4)
    if attack:
        env = VecFrameStack(make_adversarial_atari_env(env_id, n_envs, seed, model), 4)
    # Attach the (possibly adversarial) environment to the loaded model before continuing training;
    # without this, learn() has no valid environment to run on.
    model.set_env(env)
    # Policy lookup kept for the (commented-out) from-scratch training path below.
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy,
              'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy]
    # model = PPO2(policy=policy, env=env, n_steps=n_steps, nminibatches=nminibatches,
    #              lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
    #              learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1)
    model.learn(total_timesteps=num_timesteps)
    model.save("model")
    env.close()
    # Free memory
    del model
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train A2C model for atari environment, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

    model = A2C(policy_fn, env, lr_schedule=lr_schedule, seed=seed)
    model.learn(total_timesteps=int(num_timesteps * 1.1))
    env.close()
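# Usage sketch, not part of the original script: the env id, timestep budget and the
# 'linear' schedule (one of the options listed in the docstring above) are illustrative
# choices, and the call assumes the same module-level imports as the function itself.
if __name__ == '__main__':
    train('BreakoutNoFrameskip-v4', num_timesteps=int(1e6), seed=0,
          policy='cnn', lr_schedule='linear', num_env=8)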
def train(env_id, num_timesteps, seed, policy, n_envs=8, nminibatches=4, n_steps=128):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update. For recurrent policies,
        the number of environments run in parallel should be a multiple of nminibatches.
    :param n_steps: (int) The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment copies
        running in parallel)
    """
    env = make_atari_env(env_id, n_envs, seed)
    env = VecFrameStack(env, 4)
    policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy,
              'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy]
    model = PPO2(policy=policy, env=env, n_steps=n_steps, nminibatches=nminibatches,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1)
    model.learn(total_timesteps=num_timesteps)
    model.save('/serverdata/rohit/stablebaselines/atari/ppo/{}'.format(env_id), 'csv')
    env.close()
    # Free memory
    del model
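# Usage sketch, not part of the original script: values are illustrative. With the
# recurrent 'lstm' policy, n_envs=8 and nminibatches=4 satisfy the constraint in the
# docstring above that the number of parallel envs be a multiple of nminibatches.
if __name__ == '__main__':
    train('PongNoFrameskip-v4', num_timesteps=int(1e7), seed=0,
          policy='lstm', n_envs=8, nminibatches=4, n_steps=128)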
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu):
    """
    train an ACER model on atari

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_cpu: (int) The number of cpu to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    else:
        warnings.warn("Policy {} not implemented".format(policy))
        return
    model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
    # Free memory
    del model
def test_pretrain_images(tmp_path):
    env = make_atari_env("PongNoFrameskip-v4", num_env=1, seed=0)
    env = VecFrameStack(env, n_stack=4)
    model = PPO2('CnnPolicy', env)
    generate_expert_traj(model, str(tmp_path / 'expert_pong'), n_timesteps=0,
                         n_episodes=1, image_folder=str(tmp_path / 'pretrain_recorded_images'))

    expert_path = str(tmp_path / 'expert_pong.npz')
    dataset = ExpertDataset(expert_path=expert_path, traj_limitation=1,
                            batch_size=32, sequential_preprocessing=True)
    model.pretrain(dataset, n_epochs=2)

    shutil.rmtree(str(tmp_path / 'pretrain_recorded_images'))
    env.close()
    del dataset, model, env
def test_ppo(env_id, seed, path_to_policy_params, n_envs=1):
    """
    env_id: type str, identifies each environment uniquely
    seed: initial random seed
    path_to_policy_params: path to the saved PPO2 policy parameters to load
    n_envs: number of envs to run in parallel
    """
    # Stack 4 frames for the vectorized environment
    # Note: PPO2 works only with vectorized environments
    env = VecFrameStack(make_atari_env(env_id=env_id, num_env=n_envs, seed=seed), 4)
    # Load the pre-trained model object for class PPO2.
    # The policy is CnnPolicy from stable baselines and has been trained for 2e7 time steps on Pong.
    model = PPO2.load(path_to_policy_params)
    video_path = "./videos/Pong_test_without_attack"
    vr = video_recorder.VideoRecorder(env, base_path=video_path,
                                      enabled=video_path is not None)
    obs = env.reset()
    ep_rew = [0.0]
    ep = 0
    for i in range(50000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        # Single-environment rollout: track the reward of the first (only) env.
        ep_rew[-1] += rewards[0]
        env.render()
        vr.capture_frame()
        if dones[0]:
            obs = env.reset()
            print('Net reward for episode ', ep, ': ', ep_rew[-1])
            if (ep + 1) % 10 == 0:
                print('Mean reward for last 10 episodes: ', np.mean(ep_rew[-10:]))
            ep_rew.append(0.0)
            ep += 1
    print('Number of timesteps completed: ', i + 1)
    env.close()
    vr.close()
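# Usage sketch, not part of the original script: the checkpoint path is a hypothetical
# placeholder for wherever the pre-trained PPO2 Pong policy was saved.
if __name__ == '__main__':
    test_ppo('PongNoFrameskip-v4', seed=0,
             path_to_policy_params='./models/ppo2_pong.pkl', n_envs=1)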
def train(
    train_id,
    game,
    level,
    num_processes,
    num_timesteps,
    algo_name,
    policy_name,
    is_joint,
    model_save_path,
    logs_path,
    hyper_opt,
    load_model_path=None,
    train_counter=0,  # To be set (incrementally) when running multiple trainings
    short_life=False,
    backtracking=False,
):
    global global_logs_path, best_mean_reward, n_steps

    print("\n\nStarting training with args:\n")
    print(log_fun_args(locals()))
    print("\n")

    global_logs_path = logs_path
    best_mean_reward, n_steps = -np.inf, 0

    envs = []
    if is_joint:
        envs = [
            make_env(
                game=game,
                level=level,
                rank=i,
                log_dir=logs_path,
                seed=train_counter * 100,
                short_life=short_life,
                backtracking=backtracking,
            )
            for i, (game, level) in enumerate(small_train_set)
        ]
    else:
        envs = [
            make_env(
                game=game,
                level=level,
                rank=i,
                log_dir=logs_path,
                seed=train_counter * 100,
                short_life=short_life,
                backtracking=backtracking,
            )
            for i in range(num_processes)
        ]

    if num_processes == 1:
        env = VecFrameStack(DummyVecEnv(envs), 4)
    else:
        env = VecFrameStack(SubprocVecEnv(envs), 4)
    print("\n\n")

    algo = None
    if algo_name == "ppo2":
        algo = PPO2
    elif algo_name == "a2c":
        algo = A2C

    policy = None
    nminibatches = 4
    if policy_name == "cnn":
        policy = CnnPolicy
    elif policy_name == "cnnlstm":
        if is_joint:
            nminibatches = 5
        policy = CnnLstmPolicy

    model = None
    if load_model_path:
        print("Loading a model...")
        model = algo.load(load_model_path, env=env, tensorboard_log=logs_path)
    else:
        print("Creating a new model...")
        if algo_name == "ppo2":
            if hyper_opt:
                model = algo(
                    policy,
                    env,
                    verbose=1,
                    tensorboard_log=logs_path,
                    n_steps=4096,
                    nminibatches=8,
                    learning_rate=2e-4,
                    ent_coef=0.01,
                )
            else:
                model = PPO2(
                    policy,
                    env,
                    nminibatches=nminibatches,
                    verbose=1,
                    tensorboard_log=logs_path,
                )
        elif algo_name == "a2c":
            model = A2C(policy, env, verbose=1, tensorboard_log=logs_path)

    print(f"Starting training for {num_timesteps} timesteps")
    model.learn(total_timesteps=num_timesteps, callback=callback, log_interval=1)
    print("Training finished!")

    if model_save_path:
        model.save(model_save_path)
        print("Model saved in:\t", model_save_path)

    timestep_values, score_values = ts2xy(load_results(logs_path), "timesteps")
    score_values = score_values * 100
    plot_path = os.path.join(logs_path, f"{level}.png")
    print("Saving the plot in: " + plot_path)
    save_plot(timestep_values, score_values, title=level, save_path=plot_path)

    env.close()
# The original snippet is truncated here; make_atari_env(config['task'], ...) is the
# assumed call, based on how the environment is stacked and recorded below.
env = make_atari_env(config['task'],
                     num_env=config['parallel_envs'], seed=config['seed'])
env = VecFrameStack(env, n_stack=config['state_frames'])
env = VecVideoRecorder(env, video_folder,
                       record_video_trigger=lambda x: x == 0,
                       video_length=video_length,
                       name_prefix="random-agent-{}".format(config['task']))
obs = env.reset()

n = env.action_space.n
device = torch.device('cuda') if config['use_gpu'] else torch.device('cpu')
model = ActorCritic(n, config).to(device)
model.load_state_dict(
    torch.load('checkpoints/pong_noEnt/model_recent_ckpt', map_location=device))
model.eval()

for i in tqdm(range(video_length)):
    # env.render(mode='rgb_array')
    tensor = torch.from_numpy(obs.astype(np.float32).transpose((0, 3, 1, 2))) / 255
    tensor = torch.nn.functional.interpolate(tensor, scale_factor=48 / tensor.shape[-1])
    action, _, _, _ = model.forward(tensor.to(device))
    obs, _, dones, _ = env.step(action)
    if dones.sum() > 0:
        obs = env.reset()
env.close()
def record_():
    model_path = args.load_model
    assert os.path.isfile(model_path)

    # search skills
    m = re.search(r"\[[0-9, \[\]]*\]", model_path)
    if m is None:
        raise ValueError("load_model: {} does not contain skills".format(model_path))
    skills = str_to_skills(m.group(0))

    # search env-id
    env_id_list = ENV_LIST
    env_id = None
    searched = False
    m = re.search("[A-Z][a-z]*NoFrameskip-v4", model_path)
    if m is not None:
        searched = True
        env_id = m.group(0)
    if searched is not True:
        for id_ in env_id_list:
            if id_.lower() in model_path.lower():
                searched = True
                env_id = id_ + "NoFrameskip-v4"
    if searched is not True:
        raise ValueError("load_model: {} does not contain env id".format(model_path))

    save_path = args.logdir
    if save_path is None:
        save_path = os.path.dirname(model_path)

    print("ENV:{} \nskills:{} \nmodel_path:{} \nsave_path:{}\n".format(
        env_id, skills, model_path, save_path))
    time.sleep(3)

    env_creator_ = lambda env: ActionRemapWrapper(env)
    env_creator = lambda env: SkillWrapper(env_creator_(env), skills=skills)
    env = VecFrameStack(
        make_atari_env(env_id, 1, args.seed,
                       extra_wrapper_func=env_creator,
                       logdir=save_path,
                       wrapper_kwargs={"episode_life": False, "clip_rewards": False}), 4)

    if args.load_model is None:
        raise NotImplementedError
    assert os.path.isfile(args.load_model)
    if args.rl_model == "ppo":
        model = PPO2.load(args.load_model)
    elif args.rl_model == "a2c":
        model = A2C.load(args.load_model)
    elif args.rl_model is None:
        if "ppo" in model_path:
            model = PPO2.load(model_path)
        elif "a2c" in model_path:
            model = A2C.load(model_path)
        else:
            raise ValueError("please specify rl_model")
    else:
        raise ValueError("{} rl_model not recognized".format(args.rl_model))

    # DEBUG
    set_global_seeds(args.seed)
    obs = env.reset()
    if args.record:
        env = VecVideoRecorder(env, save_path,
                               record_video_trigger=lambda x: x == 0,
                               video_length=MAX_VIDEO_LENGTH)
        env.reset()

    total_rewards = 0
    action_save_path = os.path.join(save_path, "history_action.txt")
    if args.log_action:
        try:
            os.remove(action_save_path)
        except OSError as e:
            if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
                raise  # re-raise exception if a different error occurred
    log_picture = None
    if args.log_picture:
        log_picture = os.path.join(save_path, "history_action_pic")
        log_picture = mkdirs(log_picture, mode="keep")
        action_save_path = os.path.join(log_picture, os.path.basename(action_save_path))
    # try:
    #     # shutil.rmtree()
    # except:

    print("start evaluate")
    with open(action_save_path, 'a') as f:
        for steps in range(args.eval_max_steps):
            action, _states = model.predict(obs)
            if args.log_action:
                # print("{}".format(action[0]), sep=" ", file=f)
                f.write("{} ".format(action[0]))
            if args.log_picture:
                assert log_picture is not None
                pict = env.render(mode='rgb_array')
                im = Image.fromarray(pict)
                _path = os.path.join(log_picture, "{}_{}.jpg".format(steps, action[0]))
                im.save(_path)
            obs, rewards, dones, info = env.step(action)
            total_rewards += rewards
            if dones[0]:
                break

    print("steps: {}/{}".format(steps + 1, args.eval_max_steps))
    print("total_rewards: {}".format(total_rewards))
    env.close()
def get_rewards(self, skills=[], train_total_timesteps=5000000, eval_times=100,
                eval_max_steps=int(1e6), model_save_name=None, add_info={}):
    """
    :param skills: (list) the available action sequences for the agent, e.g. [[0,2,2],[0,1,1]]
    :param train_total_timesteps: (int) total_timesteps to train
    :param eval_times: (int) the number of evaluation episodes, e.g. eval_times=100 evaluates
        the policy by averaging the reward of 100 episodes
    :param eval_max_steps: (int) maximum timesteps per episode when evaluating
    :param model_save_name: (str) (deprecated) specify the name of the saved model (should not repeat)
    :param add_info: (dict) other information to log in log.txt
    """
    if self.save_tensorboard and self.save_path is not None:
        tensorboard_log = os.path.join(self.save_path, "model_" + str(self._serial_num))
    else:
        tensorboard_log = None

    env_creator = lambda env: SkillWrapper(self.env_creator(env), skills=skills, gamma=self.gamma)

    if self.save_monitor is True:
        monitor_path = os.path.join(self.save_path, "monitor")
        try:
            os.makedirs(monitor_path)
        except OSError as ex:
            if ex.errno == errno.EEXIST and os.path.exists(monitor_path):
                print("{} exists. ignore".format(monitor_path))
            else:
                raise
    else:
        monitor_path = None

    if "cfg" in self.env_id:
        env = make_doom_env(self.env_id, self.num_cpu, self.seed,
                            extra_wrapper_func=env_creator, logdir=monitor_path)
    else:
        env = VecFrameStack(
            make_atari_env(self.env_id, self.num_cpu, self.seed,
                           extra_wrapper_func=env_creator, logdir=monitor_path), 4)

    model = None
    if self.use_converge_parameter is True:
        model = self.model(self.policy, env, verbose=self.verbose,
                           tensorboard_log=tensorboard_log,
                           n_steps=128, nminibatches=4, lam=0.95, gamma=0.99,
                           noptepochs=4, ent_coef=.01,
                           learning_rate=lambda f: f * 2.5e-4,
                           cliprange=lambda f: f * 0.1)
    else:
        model = self.model(self.policy, env, verbose=self.verbose,
                           tensorboard_log=tensorboard_log)

    self.strat_time = time.time()
    print("start to train agent...")

    callback = None
    if self.evaluate_freq is not None and self.evaluate_freq > 0:
        period_eval_path = os.path.join(self.save_path, "period_eval")
        mkdirs(period_eval_path)
        if "cfg" in self.env_id:
            eval_env = make_doom_env(self.env_id, self.num_cpu, self.seed,
                                     extra_wrapper_func=env_creator, logdir=monitor_path,
                                     wrapper_kwargs={"episode_life": False,
                                                     "clip_rewards": False})
        else:
            eval_env = VecFrameStack(
                make_atari_env(self.env_id, self.num_cpu, self.seed,
                               extra_wrapper_func=env_creator, logdir=period_eval_path,
                               wrapper_kwargs={"episode_life": False,
                                               "clip_rewards": False}), 4)
        callback = self.eval_callback(eval_env, freq=self.evaluate_freq,
                                      eval_times=eval_times,
                                      eval_max_steps=eval_max_steps,
                                      save_path=period_eval_path)

    model.learn(total_timesteps=train_total_timesteps,
                reset_num_timesteps=self.reset_num_timesteps,
                callback=callback)
    print("Finish train agent")

    # evaluate once more because sometimes it is not divisible
    if callback is not None:
        callback({"self": model, "eval_now": True}, None)

    if self.save_path is not None:
        if self.preserve_model > 0:
            self.save_model(model, skills=skills)
    env.close()

    # evaluate
    env = VecFrameStack(
        make_atari_env(self.env_id, self.num_cpu, self.seed,
                       extra_wrapper_func=env_creator, logdir=None), 4)
    info = self.evaluate(env, model, eval_times, eval_max_steps)
    try:
        env.close()
    except AttributeError as e:
        print("Ignore : {}".format(e))
    try:
        del model
    except AttributeError as e:
        print("Ignore del model : {}".format(e))

    # log result
    info.update(add_info)
    self.log(info)
    self._serial_num = self._serial_num + 1
    return info["ave_score"], info["ave_action_reward"]
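# Usage sketch, not part of the original source: `trainer` is a hypothetical instance of
# whatever class defines get_rewards(); the skills list mirrors the docstring example.
# ave_score, ave_action_reward = trainer.get_rewards(skills=[[0, 2, 2], [0, 1, 1]],
#                                                    train_total_timesteps=5000000,
#                                                    eval_times=100)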