def run_experiment(exp_num, exp_type, variants, n_cpu, step_total, exp_log, log_dict, drive, og_dir):
    model_names = []
    run_path = ''
    for order, variant in enumerate(variants):
        alter_env(exp_type, variant)

        # Build n_cpu parallel environments. A factory function is used so each
        # subprocess constructs its own Monitor-wrapped env, instead of every
        # worker sharing the single instance a loop of lambdas would capture.
        def make_env():
            env = gym.make("Real-v0")
            return Monitor(env, 'tf_save', allow_early_resets=True)

        env = SubprocVecEnv([make_env for _ in range(n_cpu)])

        if order == 0:
            # First variant: start from a fresh model.
            model = PPO2(MlpPolicy, env, verbose=0,
                         tensorboard_log="./tensorboard_log/",
                         drive=drive, og_dir=og_dir)
        else:
            # Later variants: resume from the latest checkpoint of the previous run.
            pydrive_util.download_file(drive, run_path + '/checkpoint')
            load_name = load_checkpoint(-1, run_path)
            pydrive_util.download_file(drive, load_name)
            model = PPO2.load('tmp/tmp_file', env=env, drive=drive, og_dir=og_dir)

        model_names.append(model.model_name)
        run_path = model.graph_dir
        model.learn(total_timesteps=step_total)
        pydrive_util.upload_file(drive, model.checkpoint_log)
        env.close()
        del model, env

    log_experiments(exp_num, exp_type, variants, model_names, exp_log, log_dict, drive)
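# Hypothetical invocation sketch: the experiment id, type, variants, paths, and
# step budget below are illustrative placeholders, not values from the original
# project. The drive handle uses the standard PyDrive auth flow.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

gauth = GoogleAuth()
gauth.LocalWebserverAuth()
drive = GoogleDrive(gauth)

run_experiment(exp_num=0,
               exp_type='mass',
               variants=[0.5, 1.0, 2.0],
               n_cpu=4,
               step_total=100000,
               exp_log='exp_log.json',
               log_dict={},
               drive=drive,
               og_dir=os.getcwd())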
class SbSac():
    '''stable baselines SAC'''

    def __init__(self, expt_name):
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        outdir = pkg_path + '/monitor_logs/' + expt_name
        # env = gym.make('LunarLanderContinuous-v2')
        env = gym.make('DeeplengDocking-v2')
        self.expt_name = expt_name
        self.env = Monitor(env, outdir)

    def __call__(self):
        policy_kwargs = dict(layers=[400, 300, 200, 100])
        # check_env(self.env)
        model = SAC(MlpPolicy, self.env,
                    policy_kwargs=policy_kwargs,
                    tensorboard_log="/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log",
                    verbose=1)
        time_steps = 3e4
        model.learn(total_timesteps=int(time_steps),
                    log_interval=50,
                    tb_log_name="sac_Docker_" + self.expt_name)
        model.save("/home/dfki.uni-bremen.de/mpatil/Documents/sac_stable_baselines_" + self.expt_name)
        print("Closing environment")
        self.env.close()
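# Minimal usage sketch: the class is callable, so training runs via __call__.
# Assumes the 'DeeplengDocking-v2' env is registered and the deepleng
# ROS/Gazebo stack is running; 'baseline_run' is a placeholder experiment name.
if __name__ == '__main__':
    SbSac('baseline_run')()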
def main(log_dir=None, name_results_root_folder="results"):
    args = parseArgs()
    time_steps = TIME_STEPS

    # If no log_dir is given, use a default one whose name contains the training start time.
    if log_dir is None:
        if args.restart_training:
            # find the latest training folder
            latest_log_dir = os.path.join(
                name_results_root_folder,
                sorted(os.listdir(name_results_root_folder))[-1])
            logdir = latest_log_dir
        else:
            default_log_dir = os.path.join(name_results_root_folder,
                                           "DQN_" + getTimeStr())
            os.makedirs(default_log_dir, exist_ok=True)
            logdir = default_log_dir
    else:
        logdir = log_dir
    reward_bound = REWARD_BOUND

    # get arena environments and custom callback
    env = Monitor(Arena2dEnvWrapper(0, True), os.path.join(logdir, "arena_env0"))
    # env = Arena2dEnvWrapper(0, True)
    call_back = SaveOnBestTrainingRewardCallback(500, logdir, 1, reward_bound)

    # Temporary model path: if training is interrupted from the keyboard,
    # the current model parameters are saved there.
    path_temp_model = os.path.join(logdir, "DQN_TEMP")
    if not args.restart_training:
        model = DQN(MlpPolicy, env,
                    gamma=GAMMA,
                    learning_rate=LEARNING_RATE,
                    buffer_size=BUFFER_SIZE,
                    target_network_update_freq=SYNC_TARGET_STEPS,
                    tensorboard_log=logdir,
                    verbose=1)
        reset_num_timesteps = True
    else:
        if os.path.exists(path_temp_model + ".zip"):
            print("continue training the model...")
            model = DQN.load(path_temp_model, env=env)
            reset_num_timesteps = False
        else:
            print("Can't load the model with the path: {}, please check again!"
                  .format(path_temp_model))
            env.close()
            exit(-1)

    try:
        model.learn(time_steps,
                    log_interval=200,
                    callback=call_back,
                    reset_num_timesteps=reset_num_timesteps)
        model.save(os.path.join(logdir, "DQN_final"))
    except KeyboardInterrupt:
        # Save the current parameters so training can be resumed with restart_training.
        model.save(path_temp_model)
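# Hypothetical sketch of the parseArgs() helper this script assumes; only the
# restart_training flag is actually read in main() above, so a minimal version
# could look like this.
import argparse

def parseArgs():
    parser = argparse.ArgumentParser(description="DQN training for the arena2d environment")
    parser.add_argument("--restart_training", action="store_true",
                        help="resume from the latest results folder instead of starting fresh")
    return parser.parse_args()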
def train_ppo(env_id, num_timesteps, seed, policy, save_params,
              n_envs=1, nminibatches=5, n_steps=8000):
    """
    Train the PPO2 algorithm for num_timesteps.

    env_id: type str, identifies each environment uniquely
    num_timesteps: number of timesteps to run the algorithm
    seed: initial random seed
    policy: policy to be followed (mlp, cnn, lstm, etc.)
    n_envs: number of environments to run in parallel
    nminibatches: number of minibatches of mini-batch gradient descent
        (first-order optimization) used to update the policy parameters
    n_steps: number of steps collected per update
    """
    # Stack the frames for the vectorized environment.
    # Note: PPO2 works only with a vectorized environment.
    set_global_seeds(seed)
    env = make_atari(env_id)
    env.seed(seed)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = wrap_deepmind(env, frame_stack=True)

    # select the policy class
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]

    # create model object for class PPO2
    model = PPO2(policy=policy, env=env,
                 n_steps=n_steps, nminibatches=nminibatches,
                 lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)

    # train the model (originally trained for 2e7 timesteps with seed = 5)
    model.learn(total_timesteps=num_timesteps, callback=callback)

    # save the hyperparameters and weights
    model.save(save_params)
    env.close()
    # free the memory
    del model
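# Hypothetical call mirroring the comment above (2e7 timesteps, seed = 5);
# the Atari env id and save path are placeholders.
train_ppo('PongNoFrameskip-v4', num_timesteps=int(2e7), seed=5,
          policy='cnn', save_params='ppo2_pong_params')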
def ppo1(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = PPO1(MlpPolicy, env, verbose=0)

    # Train the agent
    print("Beginning training episodes with PPO1.")
    model.learn(total_timesteps=timesteps)
    env.close()
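# Hypothetical usage; the env id, log directory, and step budget are placeholders.
ppo1('CartPole-v1', './logs/ppo1/', timesteps=100000)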
def train_dqn_adv(env_id, train_timesteps, seed, policy, save_params, n_envs=1):
    set_global_seeds(seed)
    env = make_atari(env_id)
    env.seed(seed)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = wrap_deepmind(env, frame_stack=True)

    # select the policy class
    policy = {'cnn': CnnPolicy, 'mlp': MlpPolicy}[policy]

    # create model object for class DQN
    model = DQN(policy=policy, env=env, gamma=0.99, learning_rate=0.0001,
                buffer_size=10000, exploration_fraction=0.1,
                exploration_final_eps=0.01, exploration_initial_eps=1.0,
                train_freq=4, batch_size=32, double_q=True,
                learning_starts=10000, target_network_update_freq=1000,
                prioritized_replay=True, prioritized_replay_alpha=0.6,
                prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None,
                prioritized_replay_eps=1e-06, param_noise=False,
                n_cpu_tf_sess=None, verbose=1)

    callback = save_best_model_callback(save_freq=100, log_dir=log_dir,
                                        save_params=save_params, verbose=1)

    # train the model (originally trained for 2e7 timesteps with seed = 7)
    model.learn(total_timesteps=train_timesteps, callback=callback)

    plot_results([log_dir], train_timesteps,
                 results_plotter.X_TIMESTEPS, "DQNPong_TrainedByAdversary")
    plt.show()
    env.close()
    # free the memory
    del model
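# Hypothetical call mirroring the comment above (2e7 timesteps, seed = 7);
# env id and save path are placeholders.
train_dqn_adv('PongNoFrameskip-v4', train_timesteps=int(2e7), seed=7,
              policy='cnn', save_params='dqn_pong_adv_params')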
class SbTd3():
    '''stable baselines TD3'''

    def __init__(self, expt_name):
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        outdir = pkg_path + '/monitor_logs/' + expt_name
        # env = gym.make('LunarLanderContinuous-v2')
        env = gym.make('DeeplengDocking-v2')
        self.expt_name = expt_name
        self.env = Monitor(env, outdir)

    def __call__(self):
        policy_kwargs = dict(layers=[400, 300, 200, 100])

        # Gaussian exploration noise on every action dimension
        n_actions = self.env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        # check_env(self.env)
        model = TD3(MlpPolicy, self.env,
                    policy_kwargs=policy_kwargs,
                    action_noise=action_noise,
                    buffer_size=50000,
                    tensorboard_log="/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log",
                    verbose=1)
        time_steps = 3e4
        model.learn(total_timesteps=int(time_steps),
                    log_interval=50,
                    tb_log_name="td3_Docker_" + self.expt_name)
        model.save("/home/dfki.uni-bremen.de/mpatil/Documents/td3_stable_baselines_" + self.expt_name)
        print("Closing environment")
        self.env.close()
def hardcode(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)

    print("Running episodes with hardcoded policy.")
    # initialize timestep counter
    inc = 0
    while inc < timesteps:
        obs = env.reset()
        while True:
            action = policy(obs)  # hardcoded policy, defined elsewhere in the script
            obs, _, done, _ = env.step(action)
            inc += 1
            if done:
                break
    env.close()
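# The policy(obs) used above is defined elsewhere in the original script.
# A minimal hypothetical stand-in for CartPole-v1: push the cart in the
# direction the pole is falling (obs[2] is the pole angle).
def policy(obs):
    return 1 if obs[2] > 0 else 0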
def random_agent(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)

    print("Running episodes with random policy.")
    # initialize timestep counter
    inc = 0
    while inc < timesteps:
        obs = env.reset()
        while True:
            # choose a random action from action_space
            action = env.action_space.sample()
            obs, _, done, _ = env.step(action)
            inc += 1
            if done:
                break
    env.close()
class SbPpo2():
    '''stable baselines PPO2'''

    def __init__(self, expt_name):
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        outdir = pkg_path + '/monitor_logs/' + expt_name
        # env = gym.make('LunarLanderContinuous-v2')
        env = gym.make('DeeplengDocking-v2')
        self.expt_name = expt_name
        self.env = Monitor(env, outdir)

    def __call__(self, *args, **kwargs):
        # eval_callback = EvalCallback(env, best_model_save_path=eval_dir,
        #                              log_path=eval_dir, eval_freq=500,
        #                              deterministic=True, render=False)
        policy_kwargs = dict(layers=[400, 300, 200, 100])
        model = PPO2(MlpPolicy, self.env,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     tensorboard_log="/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log")
        model.learn(total_timesteps=int(1e5),
                    log_interval=50,
                    tb_log_name="ppo_Docker_" + self.expt_name)
        model.save("/home/dfki.uni-bremen.de/mpatil/Documents/ppo_stable_baselines_" + self.expt_name)
        # del model
        print("Closing environment")
        self.env.close()
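# The self-play snippet below expects these module-level constants; the values
# here are hypothetical placeholders, not the original settings.
LOGDIR = "ppo1_selfplay"
NUM_TIMESTEPS = int(1e7)
EVAL_FREQ = int(1e5)
EVAL_EPISODES = 100
n = 0  # environment seed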
logger.configure(folder=LOGDIR)

env = gym.make("SlimeVolley-v0")
env = Monitor(env, LOGDIR, allow_early_resets=True)
env.seed(n)

model = PPO1(BnnPolicy, env,
             timesteps_per_actorbatch=4096,
             clip_param=0.2, entcoeff=0.0,
             optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
             gamma=0.99, lam=0.95,
             schedule='linear', verbose=2)

eval_callback = EvalCallback(env,
                             best_model_save_path=LOGDIR,
                             log_path=LOGDIR,
                             eval_freq=EVAL_FREQ,
                             n_eval_episodes=EVAL_EPISODES)

model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

model.save(os.path.join(LOGDIR, "final_model"))

env.close()
def main():
    global save_path, log_dir, model, best_mean_reward

    mk_dir(args.checkpoint_dir + args.policy)
    save_path = args.checkpoint_dir + args.policy + "/" + args.policy
    log_dir = args.summary_dir + args.policy
    mk_dir(log_dir)

    env = gym.make("SegmentationEnv-v0",
                   objs_dir=args.objs_dir,
                   max_scenes=args.max_scenes,
                   sample_size=args.sample_size,
                   diff_punishment=args.diff_punishment,
                   max_steps_per_scene=args.max_steps_per_scene,
                   scene_mode=args.scene_mode,
                   point_mode=args.point_mode,
                   voxel_size=args.voxel_size,
                   voxel_mode=args.voxel_mode,
                   single_scenes=args.single_scenes,
                   early_diff=args.early_diff,
                   wall_weight=args.wall_weight)

    env = Monitor(env, log_dir, allow_early_resets=True)
    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([lambda: env])
    env = VecCheckNan(env, raise_exception=True)

    net_module = importlib.import_module(args.policy)
    model = PPO2(net_module.Policy, env,
                 verbose=args.verbose,
                 tensorboard_log=log_dir,
                 learning_rate=args.learning_rate,
                 ent_coef=args.ent_coef,
                 cliprange=args.cliprange,
                 cliprange_vf=args.cliprange_vf,
                 lam=args.lam,
                 gamma=args.gamma,
                 seed=args.seed,
                 n_cpu_tf_sess=args.n_cpu_tf_sess,
                 noptepochs=args.noptepochs,
                 nminibatches=args.nminibatches,
                 n_steps=args.n_steps,
                 max_grad_norm=args.max_grad_norm)

    if os.path.isfile("expert_trajectories.npz") and args.pretrain == 1:
        print("------------start pretrain------------")
        # dataset = ExpertDataset(expert_path="expert_trajectories.npz", special_shape=True,
        #                         traj_limitation=100, batch_size=16)
        dataset = ExpertDataset(expert_path="expert_trajectories.npz",
                                special_shape=True,
                                train_fraction=args.train_fraction,
                                batch_size=args.pretrain_batch_size)
        # model.pretrain(dataset, learning_rate=0.001, n_epochs=1000)
        model = model.pretrain(dataset,
                               val_interval=1,
                               learning_rate=args.pretrain_learning_rate,
                               n_epochs=args.pretrain_n_epochs)
        print("pretrain finished -- save model")
        model.save(save_path)

    # Evaluate the current model to set the initial best mean reward.
    returns = []
    print("Calculate mean reward")
    n_episodes = 10
    for i in range(n_episodes):
        total_reward = 0
        obs = env.reset()
        while True:
            action, _states = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            if done:
                returns.append(total_reward)
                break
    returns = np.array(returns)
    best_mean_reward = np.mean(returns)
    print("Best mean reward: {:.2f}".format(best_mean_reward))

    model.learn(total_timesteps=args.total_timesteps, callback=callback)
    env.close()
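# Hypothetical entry point; the original script presumably parses `args` and
# defines `callback` at module level before main() runs.
if __name__ == "__main__":
    main()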