def train():
    env = CBNEnv.create(1)
    n_steps = int(1024 / env.num_envs)
    logdir, logger = setup_logger()
    env.set_attr("logger", logger)
    log_callback = lambda locals, globals: env.env_method("log_callback")
    policy_kwargs = dict(n_lstm=192, layers=[], feature_extraction="mlp")
    model = A2C(policy=LstmPolicy,
                env=env,
                alpha=0.95,
                gamma=0.93,
                n_steps=1024,
                vf_coef=0.05,
                ent_coef=0.25,
                learning_rate=1e-4,
                max_grad_norm=10000,
                lr_schedule='linear',
                policy_kwargs=policy_kwargs,
                tensorboard_log=logdir,
                verbose=1,
                seed=94566)
    model.learn(total_timesteps=int(1e7), callback=log_callback)
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env):
    """
    Train an A2C model on an Atari environment (for testing purposes).

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_env: (int) The number of environments
    """
    policy_fn = None
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    elif policy == 'lnlstm':
        policy_fn = CnnLnLstmPolicy
    if policy_fn is None:
        raise ValueError("Error: policy {} not implemented".format(policy))

    env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4)

    model = A2C(policy_fn, env, lr_schedule=lr_schedule)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
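
# Usage sketch for the helper above. The environment ID, seed, and hyperparameters
# below are illustrative assumptions, not values taken from the original script.
if __name__ == '__main__':
    train('BreakoutNoFrameskip-v4', num_timesteps=int(1e6), seed=0,
          policy='cnn', lr_schedule='constant', num_env=8)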
def setup(model_params, output_folder_path):
    latest_model_path = find_latest_model(Path(output_folder_path))
    if latest_model_path is None:
        print("Creating model...")
        model = A2C(CnnPolicy, **model_params)
    else:
        print("Loading model...")
        model = A2C.load(latest_model_path, **model_params)

    tensorboard_dir = (output_folder_path / "tensorboard")
    ckpt_dir = (output_folder_path / "checkpoints")
    tensorboard_dir.mkdir(parents=True, exist_ok=True)
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    checkpoint_callback = CheckpointCallback(save_freq=200, verbose=2,
                                             save_path=ckpt_dir.as_posix())
    # event_callback = EveryNTimesteps(n_steps=100, callback=checkpoint_callback)
    logging_callback = CustomCallback(model=model, verbose=1)
    callbacks = CallbackList([checkpoint_callback, logging_callback])

    return model, callbacks
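
# Usage sketch for setup(): the environment and output folder below are illustrative
# assumptions (the original project supplies its own model_params and helpers such as
# CustomCallback); shown only to make the returned (model, callbacks) pair concrete.
if __name__ == '__main__':
    import gym
    from pathlib import Path
    from stable_baselines.common.vec_env import DummyVecEnv

    params = dict(env=DummyVecEnv([lambda: gym.make('BreakoutNoFrameskip-v4')]), verbose=1)
    model, callbacks = setup(params, Path('output'))
    model.learn(total_timesteps=100000, callback=callbacks)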
def main(env, load, save_path, load_path=None, train_timesteps=1.25e6, eval_timesteps=5e3):
    # arguments
    print("env %s; load %s; save_path %s; load_path %s; train_timesteps %s; eval_timesteps %s;"
          % (env, load, save_path, load_path, train_timesteps, eval_timesteps))
    train_timesteps = int(float(train_timesteps))
    eval_timesteps = int(float(eval_timesteps))

    # models path
    model_dir = os.getcwd() + "/models/"
    os.makedirs(model_dir, exist_ok=True)

    # logging path
    log_dir = os.getcwd() + "/log/" + save_path
    os.makedirs(log_dir, exist_ok=True)

    # absolute save path and models path
    save_path = model_dir + save_path
    if load and not load_path:
        print("no load path given, exiting...")
        sys.exit()
    elif load:
        load_path = model_dir + load_path

    # make environment, flattened environment, monitor, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    # load model, or start from scratch
    if load:
        print("loading model from: " + load_path)
        model = A2C.load(load_path, env=env)
    else:
        print("training model from scratch")
        model = A2C(MlpPolicy, env, verbose=1)

    # evaluate current model
    mean_reward_before_train = evaluate(model, env, num_steps=eval_timesteps)

    # train model
    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0
    model.learn(total_timesteps=train_timesteps, callback=None)

    # save model
    print("saving model to:" + save_path)
    model.save(save_path)

    # evaluate post training model
    mean_reward_after_train = evaluate(model, env, num_steps=eval_timesteps)

    # results
    print("reward before training:" + str(mean_reward_before_train))
    print("reward after training:" + str(mean_reward_after_train))
    print("done")
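
# Usage sketch for main(): the goal-based environment ID and save name are illustrative
# assumptions; any gym env exposing 'observation'/'achieved_goal'/'desired_goal' keys
# (e.g. the robotics envs) fits the FlattenDictWrapper call above.
if __name__ == '__main__':
    main(env='FetchReach-v1', load=False, save_path='a2c_fetchreach',
         train_timesteps=1.25e6, eval_timesteps=5e3)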
import logging
import sys

import gym
import numpy as np

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
# from stable_baselines.ppo2 import PPO2
from stable_baselines.a2c import A2C
# import gym_fin

logging.basicConfig(stream=sys.stdout)
logger = logging.getLogger(__name__)

env = gym.make('gym_fin:Pension-v0')
# vectorized environments make it easy to multiprocess training
env = DummyVecEnv([lambda: env])  # the algorithms require a vectorized environment to run

# model = PPO2(MlpPolicy, env, verbose=0)
model = A2C(MlpPolicy, env, verbose=0)


def evaluate(model, num_steps=1000):
    """
    Evaluate an RL agent

    :param model: (BaseRLModel object) the RL Agent
    :param num_steps: (int) number of timesteps to evaluate it
    :return: (float) Mean reward for the last 100 episodes
    """
    episode_rewards = [0.0]
    obs = env.reset()
    for i in range(num_steps):
        # _states are only useful when using LSTM policies
        action, _states = model.predict(obs)
        # here, action, rewards and dones are arrays
        # because we are using vectorized env
        obs, rewards, dones, info = env.step(action)
        # completion sketch (the original snippet was cut off here):
        # accumulate reward for the current episode and start a new one on done
        episode_rewards[-1] += rewards[0]
        if dones[0]:
            obs = env.reset()
            episode_rewards.append(0.0)
    # mean reward over (up to) the last 100 episodes
    mean_100ep_reward = round(np.mean(episode_rewards[-100:]), 1)
    print("Mean reward:", mean_100ep_reward, "Num episodes:", len(episode_rewards))
    return mean_100ep_reward
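
# Usage sketch: the timestep counts below are illustrative. Evaluate the untrained
# agent, train briefly, then evaluate again to see the effect of training.
mean_reward_before_train = evaluate(model, num_steps=1000)
model.learn(total_timesteps=10000)
mean_reward_after_train = evaluate(model, num_steps=1000)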
from stable_baselines.a2c import A2C
from stable_baselines.acer import ACER
from stable_baselines.acktr import ACKTR
from stable_baselines.deepq import DeepQ
from stable_baselines.ppo1 import PPO1
from stable_baselines.ppo2 import PPO2
from stable_baselines.trpo_mpi import TRPO
from stable_baselines.common.identity_env import IdentityEnv
from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.deepq import models as deepq_models

learn_func_list = [
    lambda e: A2C(policy=MlpPolicy, learning_rate=1e-3, n_steps=1,
                  gamma=0.7, env=e).learn(total_timesteps=10000, seed=0),
    lambda e: ACER(policy=MlpPolicy, env=e,
                   n_steps=1, replay_ratio=1).learn(total_timesteps=10000, seed=0),
    lambda e: ACKTR(policy=MlpPolicy, env=e,
                    learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000, seed=0),
    lambda e: DeepQ(policy=deepq_models.mlp([32]), batch_size=16, gamma=0.1,
                    exploration_fraction=0.001, env=e).learn(total_timesteps=40000, seed=0),
    lambda e: PPO1(policy=MlpPolicy, env=e, lam=0.7,
                   optim_batchsize=16, optim_stepsize=1e-3).learn(total_timesteps=10000, seed=0),
def main(env, load_path, fig_path):
    # arguments
    print("env %s; load_path %s; fig_path %s;" % (env, load_path, fig_path))
    log_path = os.getcwd() + "/log/" + load_path
    os.makedirs(os.getcwd() + "/figs/" + "/", exist_ok=True)
    fig_path = os.getcwd() + "/figs/" + "/" + fig_path
    load_path = os.getcwd() + "/models/" + load_path

    # make environment, flattened environment, vectorized environment
    env = gym.make(env)
    env = gym.wrappers.FlattenDictWrapper(env, ['observation', 'achieved_goal', 'desired_goal'])
    env = DummyVecEnv([lambda: env])

    # load model
    model = A2C.load(load_path, env=env)
    obs_initial = env.reset()
    obs = obs_initial

    # plot results
    plot_results(fig_path, log_path)

    # initializations
    niter = 10
    counter = 0
    timestep = 0
    results = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
    print("==============================")

    # check initial positions and quaternions
    print("grip", env.envs[0].env.env.sim.data.get_site_xpos('grip'))
    print("box", env.envs[0].env.env.sim.data.get_site_xpos('box'))
    print("tool", env.envs[0].env.env.sim.data.get_site_xpos('tool'))
    print("mocap", env.envs[0].env.env.sim.data.mocap_pos)
    print("quat", env.envs[0].env.env.sim.data.mocap_quat)
    print("==============================")

    # mocap quaternion check
    for i in range(5):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        quat = env.envs[0].env.env.sim.data.mocap_quat
        print("obs", obs)
        print("quat", quat)
    print("==============================")

    # start rendering
    dists = []
    box_goal_pos = np.array([0.6, 0.05, -0.17])
    while True:
        if counter == niter:
            break
        action, _states = model.predict(obs)
        obs_old = obs
        obs, rewards, dones, info = env.step(action)
        quaternion = env.envs[0].env.env.sim.data.mocap_quat
        if obs.all() == obs_initial.all():
            if counter % 10 == 0:
                xyzs = current[0]
                quats = current[1]
                print(xyzs)
                print(quats)
                filename = log_path + "/" + "results_" + str(counter) + ".txt"
                os.makedirs(log_path + "/", exist_ok=True)
                file = open(filename, 'w+')
                for xyz, quat in zip(xyzs, quats):
                    for coord in xyz:
                        file.write(str(coord) + " ")
                    for quat_coord in quat:
                        file.write(str(quat_coord) + " ")
                    file.write("\n")
                file.close()
                box_end_pos = np.array(obs_old[0][3:6].tolist())
                print(box_end_pos)
                print(np.shape(box_end_pos))
                print(box_goal_pos)
                print(np.shape(box_goal_pos))
                dists.append(np.linalg.norm(box_goal_pos - box_end_pos))
            current = [[[0, 0, 0] for i in range(100)], [[0, 0, 0, 0] for i in range(100)]]
            timestep = 0
            counter += 1
        print(timestep)
        print("obs", obs)
        print("quat", quaternion)

        # for average trajectory, smoothed
        for i in range(3):
            results[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            results[1][timestep][j] += quaternion[0].tolist()[j]

        # for current trajectory
        for i in range(3):
            current[0][timestep][i] += obs[0][:3].tolist()[i]
        for j in range(4):
            current[1][timestep][j] += quaternion[0].tolist()[j]

        timestep += 1
        env.render()

    # smooth paths by taking average, and calculate mean distance to goal state
    for timestep in range(100):
        for i in range(3):
            results[0][timestep][i] /= niter
        for j in range(4):
            results[1][timestep][j] /= niter
    dist = np.mean(dists)

    # print and write to file
    xyzs = results[0]
    quats = results[1]
    filename = log_path + "/" + "results_avg.txt"
    os.makedirs(log_path + "/", exist_ok=True)
    file = open(filename, 'w+')
    for xyz, quat in zip(xyzs, quats):
        for coord in xyz:
            file.write(str(coord) + " ")
        for quat_coord in quat:
            file.write(str(quat_coord) + " ")
        file.write("\n")
    file.close()

    # print average distance
    print("average distance of box from end goal: %f" % dist)
    :param rank: (int) index of the subprocess
    :param board: (numpy array) pre-determined board for env.
    """
    if board is not None:
        def _init():
            env = gym.make(env_id)
            env.seed(seed + rank)
            env.reset_task(board)
            return env
    else:
        def _init():
            env = gym.make(env_id)
            env.seed(seed + rank)
            return env
    set_global_seeds(seed)
    return _init


if __name__ == '__main__':
    num_episodes = int(sys.argv[1])
    env = make_vec_env('grid-v0', n_envs=1, vec_env_cls=SubprocVecEnv)
    policy_kwargs = {'cnn_extractor': mlp, 'n_lstm': n_lstm}
    # policy_kwargs = {'cnn_extractor': cnn, 'n_lstm': n_lstm}
    model = A2C(policy='CnnLstmPolicy', policy_kwargs=policy_kwargs, tensorboard_log='test_log',
                env=env, gamma=gamma, n_steps=n_steps, lr_schedule=lr_schedule, learning_rate=lr,
                ent_coef=ent_coef, vf_coef=vf_coef, verbose=True)
    model.learn(num_episodes)
    model.save("7x7_" + rules + "_" + "metalearning.zip")
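
# Rollout sketch (not part of the original script): reload the model saved above and
# step it in the same vectorized env. A recurrent policy such as CnnLstmPolicy needs
# its hidden state and done mask threaded through predict(); names here are illustrative.
def run_trained(model_path, env, n_rollout_steps=100):
    model = A2C.load(model_path, env=env)
    obs = env.reset()
    state, dones = None, [False] * env.num_envs
    for _ in range(n_rollout_steps):
        action, state = model.predict(obs, state=state, mask=dones)
        obs, rewards, dones, infos = env.step(action)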
import pytest

from stable_baselines.a2c import A2C
from stable_baselines.ppo1 import PPO1
from stable_baselines.ppo2 import PPO2
from stable_baselines.trpo_mpi import TRPO
from stable_baselines.common.identity_env import IdentityEnvMultiBinary, IdentityEnvMultiDiscrete
from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines.common.policies import MlpPolicy

MODEL_FUNC_LIST = [
    lambda e: A2C(policy=MlpPolicy, env=e),
    lambda e: PPO1(policy=MlpPolicy, env=e),
    lambda e: PPO2(policy=MlpPolicy, env=e),
    lambda e: TRPO(policy=MlpPolicy, env=e),
]


@pytest.mark.slow
@pytest.mark.parametrize("model_func", MODEL_FUNC_LIST)
def test_identity_multidiscrete(model_func):
    """
    Test if the algorithm (with a given policy) can learn an identity transformation
    (i.e. return observation as an action) with a multidiscrete action space

    :param model_func: (lambda (Gym Environment): BaseRLModel) the model generator
    """
    env = DummyVecEnv([lambda: IdentityEnvMultiDiscrete(10)])
    model = model_func(env)