Example #1
def run_experiment(exp_num, exp_type, variants, n_cpu, step_total, exp_log,
                   log_dict, drive, og_dir):
    model_names = []
    run_path = ''
    for order, variant in enumerate(variants):
        alter_env(exp_type, variant)
        # create one independent, Monitor-wrapped env per worker so the
        # subprocesses do not all share a single environment instance
        def make_env():
            return Monitor(gym.make("Real-v0"), 'tf_save',
                           allow_early_resets=True)
        env = SubprocVecEnv([make_env for _ in range(n_cpu)])
        if order == 0:
            model = PPO2(MlpPolicy,
                         env,
                         verbose=0,
                         tensorboard_log="./tensorboard_log/",
                         drive=drive,
                         og_dir=og_dir)
        else:
            pydrive_util.download_file(drive, run_path + '/checkpoint')
            load_name = load_checkpoint(-1, run_path)
            pydrive_util.download_file(drive, load_name)
            model = PPO2.load('tmp/tmp_file',
                              env=env,
                              drive=drive,
                              og_dir=og_dir)
        model_names.append(model.model_name)
        run_path = model.graph_dir
        model.learn(total_timesteps=step_total)
        pydrive_util.upload_file(drive, model.checkpoint_log)
        env.close()
        del model, env
    log_experiments(exp_num, exp_type, variants, model_names, exp_log,
                    log_dict, drive)
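
# Hypothetical invocation sketch for the experiment runner above; the
# experiment type, variant list, Google Drive handle and original directory
# are placeholders, and alter_env, pydrive_util and the customized PPO2 build
# must already be importable.
# run_experiment(exp_num=1, exp_type='friction', variants=[0.5, 1.0, 1.5],
#                n_cpu=4, step_total=100000, exp_log='experiments.csv',
#                log_dict={}, drive=drive, og_dir='/content/og')
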
class SbSac():
    '''stable baselines SAC'''
    def __init__(self, expt_name):
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        outdir = pkg_path + '/monitor_logs/' + expt_name

        # env = gym.make('LunarLanderContinuous-v2')
        env = gym.make('DeeplengDocking-v2')
        self.expt_name = expt_name
        self.env = Monitor(env, outdir)

    def __call__(self):

        policy_kwargs = dict(layers=[400, 300, 200, 100])

        # check_env(self.env)
        model = SAC(MlpPolicy,
                    self.env,
                    policy_kwargs=policy_kwargs,
                    tensorboard_log=
                    "/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log",
                    verbose=1)

        time_steps = 3e4
        model.learn(total_timesteps=int(time_steps),
                    log_interval=50,
                    tb_log_name="sac_Docker_" + self.expt_name)
        model.save(
            "/home/dfki.uni-bremen.de/mpatil/Documents/sac_stable_baselines_" +
            self.expt_name)

        print("Closing environment")
        self.env.close()
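
# Minimal usage sketch, assuming a ROS workspace that provides the
# deepleng_control package and registers DeeplengDocking-v2; the experiment
# name below is a placeholder.
# if __name__ == '__main__':
#     SbSac('docking_trial')()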
Example #3
def main(log_dir=None, name_results_root_folder="results"):
    args = parseArgs()
    time_steps = TIME_STEPS
    # if log_dir is not given, use a default one named after the training start time
    if log_dir is None:
        if args.restart_training:
            # find the latest training folder
            latest_log_dir = os.path.join(
                name_results_root_folder,
                sorted(os.listdir(name_results_root_folder))[-1])
            logdir = latest_log_dir
        else:
            default_log_dir = os.path.join(name_results_root_folder,
                                           "DQN_" + getTimeStr())
            os.makedirs(default_log_dir, exist_ok=True)
            logdir = default_log_dir
    else:
        logdir = log_dir
    reward_bound = REWARD_BOUND
    # get arena environments and custom callback
    env = Monitor(Arena2dEnvWrapper(0, True),
                  os.path.join(logdir, "arena_env0"))
    # env = Arena2dEnvWrapper(0, True)
    call_back = SaveOnBestTrainingRewardCallback(500, logdir, 1, reward_bound)
    # temporary model path: if training is interrupted from the keyboard, the current model parameters are saved here
    path_temp_model = os.path.join(logdir, "DQN_TEMP")
    if not args.restart_training:
        model = DQN(MlpPolicy,
                    env,
                    gamma=GAMMA,
                    learning_rate=LEARNING_RATE,
                    buffer_size=BUFFER_SIZE,
                    target_network_update_freq=SYNC_TARGET_STEPS,
                    tensorboard_log=logdir,
                    verbose=1)
        reset_num_timesteps = True
    else:
        if os.path.exists(path_temp_model + ".zip"):
            print("continue training the model...")
            model = DQN.load(path_temp_model, env=env)
            reset_num_timesteps = False
        else:
            print("Can't load the model from path: {}, please check again!"
                  .format(path_temp_model))
            env.close()
            exit(-1)
    try:
        model.learn(time_steps,
                    log_interval=200,
                    callback=call_back,
                    reset_num_timesteps=reset_num_timesteps)
        model.save(os.path.join(logdir, "DQN_final"))
    except KeyboardInterrupt:
        # keep the current parameters so training can be resumed later
        model.save(path_temp_model)
        print("KeyboardInterrupt: saved the current model to {}".format(
            path_temp_model))

def train_ppo(env_id,
              num_timesteps,
              seed,
              policy,
              save_params,
              n_envs=1,
              nminibatches=5,
              n_steps=8000):
    """
     env_id: typr str, identifies each environment uniquely
     num_timesteps: number of timesteps to run the algorithm
     seed: initial random seed
     policy: policy to be followed (mlp, cnn, lstm, etc)
     n_env: number of envs to run in parallel
     nminibatches: number of minibatches of mini batch gradient descent (first-order optimization) to update the policy params
     n_steps: number of steps in each update
    """
    # Train PPO algorithm for num_timesteps
    # stack the frames for the vectorized environment
    # Note: PPO2 works only with vectorized environment

    set_global_seeds(seed)
    env = make_atari(env_id)
    env.seed(seed)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = wrap_deepmind(env, frame_stack=True)
    # define the policy
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]
    # create model object for class PPO2
    model = PPO2(policy=policy,
                 env=env,
                 n_steps=n_steps,
                 nminibatches=nminibatches,
                 lam=0.95,
                 gamma=0.99,
                 noptepochs=4,
                 ent_coef=.01,
                 learning_rate=lambda f: f * 2.5e-4,
                 cliprange=lambda f: f * 0.1,
                 verbose=1)
    # train the model
    # trained for 2e7 timesteps with seed = 5
    model.learn(total_timesteps=num_timesteps, callback=callback)
    # save the hyperparameters and weights
    model.save(save_params)
    env.close()
    # free the memory
    del model
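
# Hypothetical call, mirroring the "trained for 2e7 timesteps with seed = 5"
# note above; the env id and save path are placeholders, and log_dir plus
# callback are assumed to be defined at module level, since the function
# refers to them as globals.
# train_ppo(env_id='PongNoFrameskip-v4', num_timesteps=int(2e7), seed=5,
#           policy='cnn', save_params='ppo2_pong_params')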
Example #5
def ppo1(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = PPO1(MlpPolicy, env, verbose=0)
    # Train the agent
    print("Beginning training episodes with PPO1.")
    model.learn(total_timesteps=timesteps)

    env.close()
def train_dqn_adv(env_id, train_timesteps, seed, policy, save_params, n_envs=1):
    set_global_seeds(seed)
    env = make_atari(env_id)
    env.seed(seed)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = wrap_deepmind(env, frame_stack=True)
    # define the policy
    policy = {'cnn': CnnPolicy, 'mlp': MlpPolicy}[policy]
    # create model object for class DQN
    model = DQN(policy=policy, env=env, gamma=0.99, learning_rate=0.0001,
                buffer_size=10000, exploration_fraction=0.1,
                exploration_final_eps=0.01, exploration_initial_eps=1.0,
                train_freq=4, batch_size=32, double_q=True,
                learning_starts=10000, target_network_update_freq=1000,
                prioritized_replay=True, prioritized_replay_alpha=0.6,
                prioritized_replay_beta0=0.4,
                prioritized_replay_beta_iters=None,
                prioritized_replay_eps=1e-06, param_noise=False,
                n_cpu_tf_sess=None, verbose=1)
    callback = save_best_model_callback(save_freq=100, log_dir=log_dir,
                                        save_params=save_params, verbose=1)
    # train the model
    # trained for 2e7 timesteps with seed = 7
    model.learn(total_timesteps=train_timesteps, callback=callback)
    plot_results([log_dir], train_timesteps, results_plotter.X_TIMESTEPS, "DQNPong_TrainedByAdversary")
    plt.show()
    env.close()
    # free the memory
    del model
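
# Follow-on sketch: reload the parameters saved by the callback for a greedy
# evaluation episode on a freshly wrapped env; the file name is a placeholder,
# while DQN.load and model.predict are standard stable-baselines calls.
# eval_env = wrap_deepmind(make_atari('PongNoFrameskip-v4'), frame_stack=True)
# model = DQN.load('best_model', env=eval_env)
# obs, done = eval_env.reset(), False
# while not done:
#     action, _ = model.predict(obs, deterministic=True)
#     obs, reward, done, info = eval_env.step(action)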
class SbTd3():
    '''stable baselines TD3'''
    def __init__(self, expt_name):
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        outdir = pkg_path + '/monitor_logs/' + expt_name

        # env = gym.make('LunarLanderContinuous-v2')
        env = gym.make('DeeplengDocking-v2')
        self.expt_name = expt_name
        self.env = Monitor(env, outdir)

    def __call__(self):

        policy_kwargs = dict(layers=[400, 300, 200, 100])
        n_actions = self.env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))

        # check_env(self.env)
        model = TD3(MlpPolicy,
                    self.env,
                    policy_kwargs=policy_kwargs,
                    action_noise=action_noise,
                    buffer_size=50000,
                    tensorboard_log=
                    "/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log",
                    verbose=1)

        time_steps = 3e4
        model.learn(total_timesteps=int(time_steps),
                    log_interval=50,
                    tb_log_name="td3_Docker_" + self.expt_name)
        model.save(
            "/home/dfki.uni-bremen.de/mpatil/Documents/td3_stable_baselines_" +
            self.expt_name)

        print("Closing environment")
        self.env.close()
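
# Follow-on sketch: reload the saved TD3 policy for a deterministic rollout;
# the load path mirrors the save path used above, the experiment name is a
# placeholder, and the docking env must still be registered.
# model = TD3.load("/home/dfki.uni-bremen.de/mpatil/Documents/"
#                  "td3_stable_baselines_docking_trial")
# eval_env = gym.make('DeeplengDocking-v2')
# obs = eval_env.reset()
# action, _ = model.predict(obs, deterministic=True)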
def hardcode(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)

    print("Running episodes with hardcoded policy.")

    inc = 0
    done = False
    while inc < timesteps:
        obs = env.reset()
        while True:
            action = policy(obs)
            obs, _, done, _ = env.step(action)
            inc += 1
            if done:
                break

    env.close()
Example #9
def random_agent(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)

    print("Running episodes with random policy.")

    # initalize timestep counter
    inc = 0

    while inc < timesteps:
        obs = env.reset()
        while True:
            # choose a random action from action_space
            action = env.action_space.sample()
            obs, _, done, _ = env.step(action)
            inc += 1
            if done:
                break

    env.close()
Example #10
class SbPpo2():
    '''stable baselines PPO2'''
    def __init__(self, expt_name):
        rospack = rospkg.RosPack()
        pkg_path = rospack.get_path('deepleng_control')
        outdir = pkg_path + '/monitor_logs/' + expt_name

        # env = gym.make('LunarLanderContinuous-v2')
        env = gym.make('DeeplengDocking-v2')
        self.expt_name = expt_name
        self.env = Monitor(env, outdir)

    def __call__(self, *args, **kwargs):
        # eval_callback = EvalCallback(env, best_model_save_path=eval_dir,
        #                              log_path=eval_dir, eval_freq=500,
        #                              deterministic=True, render=False)
        policy_kwargs = dict(layers=[400, 300, 200, 100])
        model = PPO2(MlpPolicy,
                     self.env,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     tensorboard_log=
                     "/home/dfki.uni-bremen.de/mpatil/Documents/baselines_log")

        model.learn(total_timesteps=int(1e5),
                    log_interval=50,
                    tb_log_name="ppo_Docker_" + self.expt_name)

        model.save(
            "/home/dfki.uni-bremen.de/mpatil/Documents/ppo_stable_baselines_" +
            self.expt_name)

        # del model

        print("Closing environment")
        self.env.close()
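
# Optional evaluation sketch using stable-baselines' evaluate_policy helper on
# a freshly created docking env; the load path, experiment name and episode
# count are placeholders.
# from stable_baselines.common.evaluation import evaluate_policy
# model = PPO2.load("/home/dfki.uni-bremen.de/mpatil/Documents/"
#                   "ppo_stable_baselines_docking_trial")
# eval_env = gym.make('DeeplengDocking-v2')
# mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=10)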
Example #11
    logger.configure(folder=LOGDIR)

    env = gym.make("SlimeVolley-v0")
    env = Monitor(env, LOGDIR, allow_early_resets=True)
    env.seed(n)

    model = PPO1(BnnPolicy,
                 env,
                 timesteps_per_actorbatch=4096,
                 clip_param=0.2,
                 entcoeff=0.0,
                 optim_epochs=10,
                 optim_stepsize=3e-4,
                 optim_batchsize=64,
                 gamma=0.99,
                 lam=0.95,
                 schedule='linear',
                 verbose=2)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(os.path.join(LOGDIR, "final_model"))

    env.close()
Example #12
def main():
    global save_path, log_dir, model, best_mean_reward
    mk_dir(args.checkpoint_dir + args.policy)
    save_path = args.checkpoint_dir + args.policy + "/" + args.policy
    log_dir = args.summary_dir + args.policy
    mk_dir(log_dir)
    env = gym.make("SegmentationEnv-v0",
                   objs_dir=args.objs_dir,
                   max_scenes=args.max_scenes,
                   sample_size=args.sample_size,
                   diff_punishment=args.diff_punishment,
                   max_steps_per_scene=args.max_steps_per_scene,
                   scene_mode=args.scene_mode,
                   point_mode=args.point_mode,
                   voxel_size=args.voxel_size,
                   voxel_mode=args.voxel_mode,
                   single_scenes=args.single_scenes,
                   early_diff=args.early_diff,
                   wall_weight=args.wall_weight)
    env = Monitor(env, log_dir, allow_early_resets=True)

    env = DummyVecEnv([
        lambda: env
    ])  # The algorithms require a vectorized environment to run
    env = VecCheckNan(env, raise_exception=True)

    net_module = importlib.import_module(args.policy)
    model = PPO2(net_module.Policy,
                 env,
                 verbose=args.verbose,
                 tensorboard_log=log_dir,
                 learning_rate=args.learning_rate,
                 ent_coef=args.ent_coef,
                 cliprange=args.cliprange,
                 cliprange_vf=args.cliprange_vf,
                 lam=args.lam,
                 gamma=args.gamma,
                 seed=args.seed,
                 n_cpu_tf_sess=args.n_cpu_tf_sess,
                 noptepochs=args.noptepochs,
                 nminibatches=args.nminibatches,
                 n_steps=args.n_steps,
                 max_grad_norm=args.max_grad_norm)

    if os.path.isfile("expert_trajectories.npz") and args.pretrain == 1:
        print("------------start pretrain------------")
        #dataset = ExpertDataset(expert_path="expert_trajectories.npz", special_shape=True, traj_limitation=100, batch_size=16)
        dataset = ExpertDataset(expert_path="expert_trajectories.npz",
                                special_shape=True,
                                train_fraction=args.train_fraction,
                                batch_size=args.pretrain_batch_size)
        #model.pretrain(dataset, learning_rate=0.001, n_epochs=1000)
        model = model.pretrain(dataset,
                               val_interval=1,
                               learning_rate=args.pretrain_learning_rate,
                               n_epochs=args.pretrain_n_epochs)
        print("pretrain finished -- save model")
        model.save(save_path)
        returns = []

        print("Calculate mean reward")
        n_episodes = 10
        for i in range(n_episodes):
            total_reward = 0
            obs = env.reset()
            while True:
                action, _states = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                total_reward += reward
                if done:
                    returns.append(total_reward)
                    break
        returns = np.array(returns)
        best_mean_reward = np.mean(returns)
        print("Best mean reward: {:.2f}".format(best_mean_reward))

    model.learn(total_timesteps=args.total_timesteps, callback=callback)
    env.close()