Example #1
def main():
    global model, best_model_path, last_model_path, sim_joy
    mission = 'PushStonesEnv'  # change according to the desired mission/environment
    env = gym.make(mission + '-v0').unwrapped

    # Create log and model dir
    # dir = 'stable_bl/' + mission
    dir = 'stable_bl/PushMultipleStones'
    os.makedirs(dir + '/model_dir/sac', exist_ok=True)

    jobs = ['train', 'record', 'record-w/hm', 'BC_agent', 'play']
    job = jobs[1]  # select which job to run ('record' here)
    pretrain = True

    if job == 'train':

        # create new folder
        try:
            tests = os.listdir(dir + '/model_dir/sac')
            indexes = []
            for item in tests:
                indexes.append(int(item.split('_')[1]))
            if not indexes:
                k = 0
            else:
                k = max(indexes) + 1
        except FileNotFoundError:
            os.makedirs(dir + '/log_dir/sac')
            k = 0

        model_dir = os.getcwd() + '/' + dir + '/model_dir/sac/test_{}'.format(
            str(k))

        best_model_path = model_dir
        last_model_path = model_dir

        log_dir = dir + '/log_dir/sac/test_{}'.format(str(k))
        logger.configure(folder=log_dir,
                         format_strs=['stdout', 'log', 'csv', 'tensorboard'])

        num_timesteps = int(1e6)

        policy_kwargs = dict(layers=[64, 64, 64])

        # SAC - start learning from scratch
        model = SAC(sac_MlpPolicy,
                    env,
                    gamma=0.99,
                    learning_rate=1e-4,
                    buffer_size=500000,
                    learning_starts=0,
                    train_freq=1,
                    batch_size=64,
                    tau=0.01,
                    ent_coef='auto',
                    target_update_interval=1,
                    gradient_steps=1,
                    target_entropy='auto',
                    action_noise=None,
                    random_exploration=0.0,
                    verbose=2,
                    tensorboard_log=log_dir,
                    policy_kwargs=policy_kwargs,
                    _init_setup_model=True,
                    full_tensorboard_log=True,
                    seed=None,
                    n_cpu_tf_sess=None)

        # Load best model and continue learning
        # models = os.listdir(dir + '/model_dir/sac')
        # models_rew = (model for model in models if 'rew' in model)
        # ind, reward = [], []
        # for model in models_rew:
        #     ind.append(model.split('_')[1])
        #     reward.append(model.split('_')[3])
        # best_reward = max(reward)
        # best_model_ind = reward.index(best_reward)
        # k = ind[best_model_ind]
        # model = SAC.load(dir + '/model_dir/sac/test_' + k + '_rew_' + best_reward, env=env,
        #                  custom_objects=dict(learning_starts=0))
        # Load last saved model and continue learning
        # models = os.listdir(dir + '/model_dir/sac')
        # models_time = (model for model in models if 'rew' not in model)
        # ind, hour, min = [], [], []
        # for model in models_time:
        #     ind.append(model.split('_')[1])
        #     hour.append(model.split('_')[3])
        #     min.append(model.split('_')[4])
        # date = models_time[0].split('_')[2]
        # latest_hour = max(hour)
        # latest_hour_ind = [i for i, n in enumerate(hour) if n == latest_hour]
        # latest_min = max(min[latest_hour_ind])
        # latest_min_ind = min(latest_min)
        # k = ind[latest_min_ind]
        # model = SAC.load(dir + '/model_dir/sac/test_' + k + '_' + date + '_' + latest_hour[0] + '_' + latest_min + 'zip',
        #                  env=env, custom_objects=dict(learning_starts=0))

        # model = SAC.load(dir + '/model_dir/sac/test_53_rew_24383.0',
        #                  env=env, tensorboard_log=log_dir,
        #                  custom_objects=dict(learning_starts=0, learning_rate=2e-4,
        #                                      train_freq=8, gradient_steps=4, target_update_interval=4))
        # #                                              # batch_size=32))

        # behaviour-cloning pretraining from the recorded expert data
        if pretrain:
            # build the .npz dataset only once, then reuse it:
            # expert_dataset('3_rocks_40_episodes')
            dataset = ExpertDataset(expert_path=(os.getcwd() + '/dataset.npz'),
                                    traj_limitation=-1)
            model.pretrain(dataset, n_epochs=2000)

        # Test the pre-trained model
        # env = model.get_env()
        # obs = env.reset()
        #
        # reward_sum = 0.0
        # for _ in range(1000):
        #     action, _ = model.predict(obs)
        #     obs, reward, done, _ = env.step(action)
        #     reward_sum += reward
        #     if done:
        #         print(reward_sum)
        #         reward_sum = 0.0
        #         obs = env.reset()
        #
        # env.close()

        # learn
        model.learn(total_timesteps=num_timesteps, callback=save_fn)

        # PPO1
        # model = PPO1(Common_MlpPolicy, env, gamma=0.99, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01,
        #      optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, lam=0.95, adam_epsilon=1e-5,
        #      schedule='linear', verbose=0, tensorboard_log=None, _init_setup_model=True,
        #      policy_kwargs=None, full_tensorboard_log=False, seed=None, n_cpu_tf_sess=1)

        # TRPO
        # model = TRPO(MlpPolicy, env, timesteps_per_batch=4096, tensorboard_log=log_dir, verbose=1)
        # model.learn(total_timesteps=500000)
        # model.save(log_dir)

    elif job == 'record':

        mission = 'PushStonesHeatMapEnv'
        env = gym.make(mission + '-v0').unwrapped

        obs = []
        actions = []
        rewards = []
        dones = []
        episode_rewards = []

        num_episodes = 30

        listener = keyboard.Listener(on_press=on_press)
        listener.start()

        # use a while-loop so that a discarded episode can be repeated
        episode = 0
        while episode < num_episodes:

            ob = env.reset()
            done = False
            print('Episode number ', episode + 1)
            episode_reward = 0

            while not done:

                act = "recording"
                # act = sim_joy
                # act = [0,1,0.5]
                new_ob, reward, done, info = env.step(act)

                # print(info['action'])
                # print(ob)

                if recorder_on:
                    obs.append(ob)
                    actions.append(info['action'])
                    rewards.append(reward)
                    dones.append(done)
                    episode_reward = episode_reward + reward

                ob = new_ob

            episode_rewards.append(episode_reward)

            if info['reset reason'] == 'out of boarders' or info[
                    'reset reason'] == 'limit time steps':
                # do not count this episode; decrementing the loop variable
                # of a for-loop would have no effect in Python
                continue
            else:
                print('saving data')
                data_saver(obs, actions, rewards, dones, episode_rewards)
                episode += 1

    elif job == 'play':
        # env = gym.make('PickUpEnv-v0')
        model = SAC.load(dir + '/model_dir/sac/test_25_25_14_15',
                         env=env,
                         custom_objects=dict(learning_starts=0))  ### ADD NUM

        for _ in range(2):

            obs = env.reset()
            done = False
            while not done:
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
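
The save_fn callback passed to model.learn() in Example #1 is not shown in the listing. Below is a minimal sketch of what such a checkpoint callback could look like under Stable-Baselines 2.x, which calls the callback with (locals_, globals_) at every training step; the saving cadence and the exact use of best_model_path / last_model_path are assumptions for illustration, not the original implementation.

import numpy as np

best_mean_reward = -np.inf

def save_fn(_locals, _globals):
    # Hypothetical checkpoint callback: keep the latest model plus the best
    # model by mean episode reward. Returning True lets training continue.
    global best_mean_reward
    step = _locals.get('step', 0)
    if step == 0 or step % 10000 != 0:
        return True
    model_ = _locals['self']                      # the SAC instance being trained
    episode_rewards = _locals.get('episode_rewards', [])
    if len(episode_rewards) > 1:
        mean_reward = float(np.mean(episode_rewards[-100:]))
        if mean_reward > best_mean_reward:
            best_mean_reward = mean_reward
            model_.save(best_model_path + '_rew_{:.1f}'.format(mean_reward))
    model_.save(last_model_path)                  # always keep the latest weights
    return True
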
Example #2

def expert(obs):
    """Scripted expert policy: map an observation to an action via the planner."""
    try:
        state = State(env_depth, env_width).load_obs(obs)
        return get_behav(state, weights={'fr': 0.3})
    except NoPathError:
        # fall back to a zero action when no path is found
        return np.zeros(env_depth * 2)


# run once to create expert.npz:
# generate_expert_traj(expert, 'expert', Env(env_depth, env_width, nlayers), n_episodes=100)

# pretrain the model on the recorded expert transitions
dataset = ExpertDataset(expert_path='expert.npz')
model = SAC('MlpPolicy', Env(env_depth, env_width, nlayers), verbose=1)
model.pretrain(dataset, n_epochs=5000)
model.save('pretrained_sac')

# Test the pre-trained model
env = model.get_env()
obs = env.reset()

# running totals across episodes; the vectorized env returned by
# model.get_env() resets itself automatically when an episode ends
reward_sum = 0
i = 0
for _ in range(1000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    reward_sum += reward
    i += 1
    if done:
        # cumulative reward, steps so far, and average reward per step
        print(reward_sum, i, reward_sum / i)
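
The pretraining in both examples loads expert data from a .npz file (dataset.npz in Example #1, expert.npz here). Stable-Baselines' ExpertDataset expects the key layout that generate_expert_traj produces: obs, actions, rewards, episode_returns and episode_starts. A hypothetical writer for that layout is sketched below; the helper name and the episode_starts construction are illustrative and are not the original data_saver / expert_dataset helpers.

import numpy as np

def save_expert_npz(path, obs, actions, rewards, dones, episode_returns):
    # Write trajectories in the .npz layout understood by
    # stable_baselines.gail.ExpertDataset.
    episode_starts = [True] + list(dones[:-1])   # a step starts an episode if the previous step ended one
    np.savez(path,
             obs=np.array(obs),
             actions=np.array(actions),
             rewards=np.array(rewards),
             episode_returns=np.array(episode_returns),
             episode_starts=np.array(episode_starts))

# e.g. save_expert_npz('dataset.npz', obs, actions, rewards, dones, episode_rewards)
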
Example #3
def train(env_id, algo, num_timesteps, seed, sgd_steps, t_pi, t_c, lam, log,
          expert_path, pretrain, pretrain_epochs, mdpo_update_steps,
          num_trajectories, expert_model, exploration_bonus, bonus_coef,
          random_action_len, is_action_features, dir_name, neural, lipschitz,
          args):
    """
    Train TRPO model for the mujoco environment, for testing purposes
    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """

    with tf_util.single_threaded_session():
        # from mpi4py import MPI
        # rank = MPI.COMM_WORLD.Get_rank()
        rank = 0
        env_name = env_id[:-3].lower()
        log_dir = './experiments/' + env_name + '/' + str(algo).lower() + '/'\
                  + 'tpi' + str(t_pi) + '_tc' + str(t_c) + '_lam' + str(lam)
        log_dir += '_' + dir_name + '/'
        log_name = str(algo) + '_updateSteps' + str(mdpo_update_steps)
        # log_name += '_randLen' + str(random_action_len)
        if exploration_bonus:
            log_name += '_exploration' + str(bonus_coef)
        if pretrain:
            log_name += '_pretrain' + str(pretrain_epochs)
        if not is_action_features:
            log_name += "_states_only"
        log_name += '_s' + str(seed)

        log_path = log_dir + log_name
        expert_path = './experts/' + expert_path

        num_timesteps = int(num_timesteps)

        args = args.__dict__

        dir_path = os.getcwd() + log_dir[1:]
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
            with open(os.getcwd() + log_dir[1:] + 'args.txt', 'w') as file:
                file.write("Experiment Arguments:\n")
                for key, val in args.items():
                    print(key, ": ", val, file=file)

        if log:
            if rank == 0:
                logger.configure(log_path)
            else:
                logger.configure(log_path, format_strs=[])
                logger.set_level(logger.DISABLED)
        else:
            if rank == 0:
                logger.configure()
            else:
                logger.configure(format_strs=[])
                logger.set_level(logger.DISABLED)

        # workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()

        # env = make_mujoco_env(env_id, workerseed)
        def make_env():
            # env_out = gym.make(env_id, reset_noise_scale=1.0)
            env_out = gym.make(env_id)
            env_out = bench.Monitor(env_out,
                                    logger.get_dir(),
                                    allow_early_resets=True)
            env_out.seed(seed)
            env_out = wrap_mujoco(env_out, random_action_len=random_action_len)
            return env_out

        #

        env = DummyVecEnv([make_env])
        # env = VecNormalize(env)

        # simple mode flags; 'do_eval' avoids shadowing the builtin eval()
        do_train = algo == 'Train'
        do_eval = algo == 'Evaluate'

        if do_train:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)

            if num_timesteps > 0:
                model = SAC('MlpPolicy',
                            env_id,
                            verbose=1,
                            buffer_size=1000000,
                            batch_size=256,
                            ent_coef='auto',
                            train_freq=1,
                            tau=0.01,
                            gradient_steps=1,
                            learning_starts=10000)
            else:
                model = SAC.load(expert_model, env)
            generate_expert_traj(model,
                                 expert_path,
                                 n_timesteps=num_timesteps,
                                 n_episodes=num_trajectories)
            if num_timesteps > 0:
                model.save('sac_' + env_name + '_' + str(num_timesteps))
        elif do_eval:
            from stable_baselines import SAC
            env = VecNormalize(env, norm_reward=False, norm_obs=False)
            model = SAC.load(expert_model, env)
            generate_expert_traj(model,
                                 expert_path,
                                 n_timesteps=num_timesteps,
                                 n_episodes=10,
                                 evaluate=True)
        else:
            expert_path = expert_path + '.npz'
            dataset = ExpertDataset(expert_path=expert_path,
                                    traj_limitation=10,
                                    verbose=1)

            if algo == 'MDAL':
                model = MDAL_MDPO_OFF('MlpPolicy',
                                      env,
                                      dataset,
                                      verbose=1,
                                      tensorboard_log="./experiments/" +
                                      env_name + "/mdal/",
                                      seed=seed,
                                      buffer_size=1000000,
                                      ent_coef=0.0,
                                      learning_starts=10000,
                                      batch_size=256,
                                      tau=0.01,
                                      gamma=0.99,
                                      gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps,
                                      lam=0.0,
                                      train_freq=1,
                                      d_step=10,
                                      tsallis_q=1,
                                      reparameterize=True,
                                      t_pi=t_pi,
                                      t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      neural=neural,
                                      lipschitz=lipschitz)
            elif algo == 'MDAL_ON_POLICY':
                model = MDAL_MDPO_ON('MlpPolicy',
                                     env,
                                     dataset,
                                     verbose=1,
                                     timesteps_per_batch=2048,
                                     tensorboard_log="./experiments/" +
                                     env_name + "/mdal_mdpo_on/",
                                     seed=seed,
                                     max_kl=0.01,
                                     cg_iters=10,
                                     cg_damping=0.1,
                                     entcoeff=0.0,
                                     adversary_entcoeff=0.001,
                                     gamma=0.99,
                                     lam=0.95,
                                     vf_iters=5,
                                     vf_stepsize=1e-3,
                                     sgd_steps=sgd_steps,
                                     klcoeff=1.0,
                                     method="multistep-SGD",
                                     tsallis_q=1.0,
                                     t_pi=t_pi,
                                     t_c=t_c,
                                     exploration_bonus=exploration_bonus,
                                     bonus_coef=bonus_coef,
                                     is_action_features=is_action_features,
                                     neural=neural)

            elif algo == 'MDAL_TRPO':
                model = MDAL_TRPO('MlpPolicy',
                                  env,
                                  dataset,
                                  verbose=1,
                                  tensorboard_log="./experiments/" + env_name +
                                  "/mdal_trpo/",
                                  seed=seed,
                                  gamma=0.99,
                                  g_step=3,
                                  d_step=5,
                                  sgd_steps=1,
                                  d_stepsize=9e-5,
                                  entcoeff=0.0,
                                  adversary_entcoeff=0.001,
                                  max_kl=t_pi,
                                  t_pi=t_pi,
                                  t_c=t_c,
                                  exploration_bonus=exploration_bonus,
                                  bonus_coef=bonus_coef,
                                  is_action_features=is_action_features,
                                  neural=neural,
                                  lam=0.98,
                                  timesteps_per_batch=2000,
                                  lipschitz=lipschitz)

            elif algo == 'GAIL':
                from mpi4py import MPI
                from stable_baselines import GAIL

                model = GAIL('MlpPolicy',
                             env,
                             dataset,
                             verbose=1,
                             tensorboard_log="./experiments/" + env_name +
                             "/gail/",
                             seed=seed,
                             entcoeff=0.0,
                             adversary_entcoeff=0.001,
                             lipschitz=lipschitz)

            elif algo == 'GAIL_MDPO_OFF':
                # from mpi4py import MPI
                from stable_baselines import GAIL_MDPO_OFF

                model = GAIL_MDPO_OFF('MlpPolicy',
                                      env,
                                      dataset,
                                      verbose=1,
                                      tensorboard_log="./experiments/" +
                                      env_name + "/gail_mdpo_off/",
                                      seed=seed,
                                      ent_coef=0.0,
                                      adversary_entcoeff=0.001,
                                      buffer_size=1000000,
                                      learning_starts=10000,
                                      batch_size=256,
                                      tau=0.01,
                                      gamma=0.99,
                                      gradient_steps=sgd_steps,
                                      mdpo_update_steps=mdpo_update_steps,
                                      lam=0.0,
                                      train_freq=1,
                                      tsallis_q=1,
                                      reparameterize=True,
                                      t_pi=t_pi,
                                      t_c=t_c,
                                      exploration_bonus=exploration_bonus,
                                      bonus_coef=bonus_coef,
                                      is_action_features=is_action_features,
                                      lipschitz=lipschitz)
            else:
                raise ValueError("Not a valid algorithm.")

            if pretrain:
                model.pretrain(dataset, n_epochs=pretrain_epochs)

            model.learn(total_timesteps=num_timesteps, tb_log_name=log_name)

        env.close()
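
The train() function in Example #3 is presumably driven from a command-line script. A hypothetical minimal entry point is sketched below; the flag names simply mirror the function signature and every default value is an illustrative placeholder, not a setting from the original experiments.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', default='Hopper-v2')
    parser.add_argument('--algo', default='MDAL',
                        choices=['Train', 'Evaluate', 'MDAL', 'MDAL_ON_POLICY',
                                 'MDAL_TRPO', 'GAIL', 'GAIL_MDPO_OFF'])
    parser.add_argument('--num_timesteps', type=int, default=int(1e6))
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()

    # the remaining values are placeholders chosen only to make the call complete
    train(env_id=args.env_id, algo=args.algo, num_timesteps=args.num_timesteps,
          seed=args.seed, sgd_steps=10, t_pi=0.5, t_c=0.01, lam=0.0, log=True,
          expert_path='expert_hopper', pretrain=False, pretrain_epochs=1000,
          mdpo_update_steps=1, num_trajectories=10, expert_model='sac_hopper_1000000',
          exploration_bonus=False, bonus_coef=0.0, random_action_len=0,
          is_action_features=True, dir_name='test', neural=False,
          lipschitz=1.0, args=args)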