Example #1
def symmetric_bc(model_savename,
                 bc_params,
                 num_epochs=1000,
                 lr=1e-4,
                 adam_eps=1e-8):
    """DEPRECATED: Trains two BC models from the same data. Splits data 50-50 and uses each subset as training data for
    one model and validation for the other."""
    expert_trajs = get_trajs_from_data(bc_params["data_params"])

    save_npz_file(expert_trajs, "temp")
    train_dataset = ExpertDataset(expert_path="temp",
                                  verbose=1,
                                  train_fraction=0.5)
    train_indices = train_dataset.train_loader.original_indices
    val_indices = train_dataset.val_loader.original_indices

    # Train BC model
    train_model_save_dir = model_savename + "_train/"
    bc_from_dataset_and_params(train_dataset, bc_params, train_model_save_dir,
                               num_epochs, lr, adam_eps)

    # Swap the training and validation splits (somewhat hacky)
    indices_split = (val_indices, train_indices)
    test_dataset = ExpertDataset(expert_path="temp",
                                 verbose=1,
                                 train_fraction=0.5,
                                 indices_split=indices_split)

    # Test BC model
    test_model_save_dir = model_savename + "_test/"
    bc_from_dataset_and_params(test_dataset, bc_params, test_model_save_dir,
                               num_epochs, lr, adam_eps)
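
A minimal sketch of the 50-50 split-and-swap idea above, assuming the trajectories are already flattened into index-aligned arrays; `split_and_swap` is an illustrative helper, not part of the original code:

import numpy as np

def split_and_swap(n_samples, seed=0):
    """Yield (train_indices, val_indices) for two symmetric 50-50 folds."""
    rng = np.random.RandomState(seed)
    indices = rng.permutation(n_samples)
    half = n_samples // 2
    first, second = indices[:half], indices[half:]
    # Fold 1 trains on the first half and validates on the second;
    # fold 2 swaps the subsets, mirroring the indices_split trick above.
    yield first, second
    yield second, first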
Example #2
def test_dataset_param_validation():
    with pytest.raises(ValueError):
        ExpertDataset()

    traj_data = np.load(EXPERT_PATH_PENDULUM)
    with pytest.raises(ValueError):
        ExpertDataset(traj_data=traj_data, expert_path=EXPERT_PATH_PENDULUM)
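
For reference, the two constructions this test treats as valid are sketched below; the path is assumed to match the archive used in the stable-baselines test suite:

import numpy as np
from stable_baselines.gail import ExpertDataset

EXPERT_PATH_PENDULUM = "stable_baselines/gail/dataset/expert_pendulum.npz"  # assumed path

# Valid: load the archive from disk via expert_path ...
dataset_from_file = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, verbose=0)

# ... or pass already-loaded data via traj_data, but never both and never neither.
dataset_from_memory = ExpertDataset(traj_data=np.load(EXPERT_PATH_PENDULUM), verbose=0)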
Example #3
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    env = gym.make(args.env)
    train_log_dir = os.path.join(
        args.train_log_dir,
        args.env + '_' + args.expert + '_' + args.policy_type)
    if args.expert == 'PPO':
        expert_model = PPO1(args.policy_type,
                            env,
                            verbose=1,
                            tensorboard_log=train_log_dir)
    else:
        raise NotImplementedError
    expert_model.learn(total_timesteps=args.expert_training_step)
    generate_expert_traj(expert_model,
                         os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=1000,
                         n_episodes=args.expert_episodes)

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir,
                                                     'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type,
                      env,
                      dataset,
                      verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, env, num_steps=10000)
    gail_model.save(train_log_dir)
    env.close()
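
The `evaluate` helper used above is not shown in this example; a minimal sketch matching how it is called (mean episode reward over a fixed number of steps in a plain gym env) could look like this:

import numpy as np

def evaluate(model, env, num_steps=10000):
    """Roll the model out for num_steps and return the mean episode reward."""
    episode_rewards = []
    episode_reward = 0.0
    obs = env.reset()
    for _ in range(num_steps):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        episode_reward += reward
        if done:
            episode_rewards.append(episode_reward)
            episode_reward = 0.0
            obs = env.reset()
    mean_reward = float(np.mean(episode_rewards)) if episode_rewards else episode_reward
    print("Mean episode reward: {:.1f}".format(mean_reward))
    return mean_reward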
Example #4
    def pre_train(self):
        # Use only one expert trajectory;
        # specify `traj_limitation=-1` to use the whole dataset
        dataset = ExpertDataset(expert_path='expert_cartpole.npz',
                                traj_limitation=1,
                                batch_size=128)

        model = PPO2('MlpPolicy', 'CartPole-v1', verbose=1)
        # Pretrain the PPO2 model
        model.pretrain(dataset, n_epochs=1000)

        # As an option, you can train the RL agent
        # model.learn(int(1e5))

        # Test the pre-trained model
        env = model.get_env()
        obs = env.reset()

        reward_sum = 0.0
        for _ in range(1000):
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            reward_sum += reward
            env.render()
            if done:
                print(reward_sum)
                reward_sum = 0.0
                obs = env.reset()

        env.close()
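
The `expert_cartpole.npz` archive loaded above can be recorded with `generate_expert_traj`; a short sketch, using a DQN expert as one possible choice (any trained model or callable works):

from stable_baselines import DQN
from stable_baselines.gail import generate_expert_traj

# Train a quick expert and record 10 of its episodes to expert_cartpole.npz
expert = DQN('MlpPolicy', 'CartPole-v1', verbose=1)
generate_expert_traj(expert, 'expert_cartpole', n_timesteps=int(1e5), n_episodes=10)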
Example #5
def main(env):

    n_actions = env.action_space.shape[0]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))

    # Use only one expert trajectory;
    # specify `traj_limitation=-1` to use the whole dataset
    file_dir = "/home/vignesh/Thesis_Suture_data/trial2/ambf_data/"
    dataset = ExpertDataset(expert_path=file_dir + 'expert_psm_data.npz',
                            traj_limitation=1,
                            batch_size=32)

    model = DDPG(MlpPolicy,
                 env,
                 gamma=0.95,
                 verbose=1,
                 nb_train_steps=300,
                 nb_rollout_steps=150,
                 param_noise=param_noise,
                 batch_size=128,
                 action_noise=action_noise,
                 random_exploration=0.05,
                 normalize_observations=True,
                 tensorboard_log="./ddpg_dvrk_tensorboard/",
                 observation_range=(-1.5, 1.5))

    model.pretrain(dataset, n_epochs=1000)
    model.save("./gail_robot_env")
Example #6
def train_agent_with_a2c(load=False):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import SubprocVecEnv
    from stable_baselines import A2C
    import tensorflow as tf  # needed for inspecting variables in the load branch

    # multiprocess environment (note: the single-process gym.make call below
    # overwrites this SubprocVecEnv, so it is effectively unused)
    n_cpu = 4
    env = SubprocVecEnv([lambda: gym.make('F16GCAS-v0') for i in range(n_cpu)])
    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128])
    if not load:
        model = A2C(env=env, verbose=1, policy=CustomPolicy)
        # model.learn(total_timesteps=1000000)
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
    else:
        model = A2C.load(ROOT+"/trained_models/TDRL/f16/a2c/128_128", env=env)
        with model.graph.as_default():
            for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
                print(i)

    return model
Example #7
def train_agent_with_ddpg(load):
    from stable_baselines.ddpg.policies import FeedForwardPolicy
    from stable_baselines.common.vec_env import DummyVecEnv
    from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise
    from stable_baselines import DDPG

    # Create and wrap the environment
    env = gym.make('F16GCAS-v0')
    env = DummyVecEnv([lambda: env])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.01) * np.ones(n_actions))

    # Custom MLP policy of two layers of size 16 each
    class CustomPolicy(FeedForwardPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128],
                                               layer_norm=False,
                                               feature_extraction="mlp")

    model = DDPG(CustomPolicy, env, verbose=1, action_noise=action_noise)

    if not load:
        ExpData = ExpertDataset("./lqr_export.npz")
        model.pretrain(ExpData, n_epochs=100)
        model.save(ROOT+"/trained_models/TDRL/f16/ddpg/128_128")
    else:
        model = DDPG.load(ROOT+"/trained_models/TDRL/f16/ddpg/128_128", policy=CustomPolicy, env=env)

    return model
Example #8
def main(exp_traj_fn, rep_as_str, from_scratch):
    env_name = f"zelda-{rep_as_str}-v0"
    log_dir = f'runs/{rep_as_str}'

    kwargs_dict = {'resume': False, 'render': True}

    if rep_as_str == 'wide':
        policy = FullyConvPolicyBigMap
    else:
        policy = CustomPolicyBigMap

    env = make_vec_envs(env_name, rep_as_str, log_dir, n_cpu=1, **kwargs_dict)

    model = PPO2(policy,
                 env,
                 verbose=1,
                 tensorboard_log=f"./runs/{rep_as_str}")
    if not from_scratch:
        # load() is a classmethod that returns a new model, so the result must be assigned
        model = PPO2.load(f'models/{rep_as_str}/zelda_{rep_as_str}', env=env)

    dataset = ExpertDataset(
        expert_path=f'expert_trajectories/{rep_as_str}/{exp_traj_fn}.npz',
        traj_limitation=-1,
        batch_size=15)
    start_time = time.process_time()
    model.set_env(env)
    model.pretrain(dataset, n_epochs=15)
    end_time = time.process_time()
    print(f"training took {end_time - start_time} seconds")
    model.save(f'models/{rep_as_str}/zelda_{rep_as_str}')
Example #9
def train():

    # Load Model

    env = gym.make('roundabout-v0')

    model = DQN(MlpPolicy, env, verbose=1)
    generate_expert_traj(model, 'expert_roundabout', n_timesteps=1000, n_episodes=10)

    # Preprocess the expert data: flatten each observation to 1D
    expert_data = dict(np.load('expert_roundabout.npz'))
    print("my keys are: " + str(expert_data.keys()))
    obs = expert_data['obs']
    print(obs.shape)
    expert_data['obs'] = obs.reshape(obs.shape[0], -1)  # flatten each observation, keep one row per transition
    print("my keys are: " + str(expert_data.keys()))
    np.savez('expert_roundabout.npz', **expert_data)  # expand the dict so the original keys are preserved

    dataset = ExpertDataset(expert_path='expert_roundabout.npz', traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env, dataset, verbose=1)
    model.learn(total_timesteps=1000)
    model.save("gail_roundabout")

    env.close()
    del env
Example #10
def test_behavior_cloning_discrete(tmp_path, model_class):
    dataset = ExpertDataset(expert_path=EXPERT_PATH_DISCRETE, traj_limitation=10,
                            sequential_preprocessing=True, verbose=0)
    model = model_class("MlpPolicy", "CartPole-v1")
    model.pretrain(dataset, n_epochs=10)
    model.save(str(tmp_path / "test-pretrain"))
    del dataset, model
Example #11
def test_gail(expert_env):
    env_id, expert_path = expert_env
    env = gym.make(env_id)
    dataset = ExpertDataset(expert_path=expert_path,
                            traj_limitation=10,
                            sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy',
                 env,
                 adversary_entcoeff=0.0,
                 lam=0.92,
                 max_kl=0.001,
                 expert_dataset=dataset,
                 hidden_size_adversary=64,
                 verbose=0)

    model.learn(1000)
    model.save("GAIL-{}".format(env_id))
    model = model.load("GAIL-{}".format(env_id), env=env)
    model.learn(1000)

    obs = env.reset()

    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, _, done, _ = env.step(action)
        if done:
            obs = env.reset()
    del dataset, model
Example #12
def test_gail(tmp_path, expert_env):
    env_id, expert_path, load_from_memory = expert_env
    env = gym.make(env_id)

    traj_data = None
    if load_from_memory:
        traj_data = np.load(expert_path)
        expert_path = None
    dataset = ExpertDataset(traj_data=traj_data,
                            expert_path=expert_path,
                            traj_limitation=10,
                            sequential_preprocessing=True)

    # Note: train for 1M steps to have a working policy
    model = GAIL('MlpPolicy',
                 env,
                 adversary_entcoeff=0.0,
                 lam=0.92,
                 max_kl=0.001,
                 expert_dataset=dataset,
                 hidden_size_adversary=64,
                 verbose=0)

    model.learn(300)
    model.save(str(tmp_path / "GAIL-{}".format(env_id)))
    model = model.load(str(tmp_path / "GAIL-{}".format(env_id)), env=env)
    model.learn(300)

    evaluate_policy(model, env, n_eval_episodes=5)
    del dataset, model
Example #13
def imitate(model, expert_path, model_path, learning_rate, n_epochs=1000):

    dataset = ExpertDataset(expert_path=expert_path + '.npz', batch_size=128)

    model.pretrain(dataset, n_epochs=n_epochs, learning_rate=learning_rate)

    model.save(model_path)
Example #14
def train(env, implemented_combos, model_logdir, arg_dict, pretrained_model=None):
    model_name = arg_dict["algo"] + '_' + str(arg_dict["steps"])
    conf_pth   = os.path.join(model_logdir, "train.json")
    model_path = os.path.join(model_logdir, "best_model.zip")
    arg_dict["model_path"] = model_path
    with open(conf_pth, "w") as f:
        json.dump(arg_dict, f, indent=4)

    model_args = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][1]
    model_kwargs = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][2]
    if pretrained_model:
        if not os.path.isabs(pretrained_model):
            pretrained_model = pkg_resources.resource_filename("myGym", pretrained_model)
        env = model_args[1]
        vec_env = DummyVecEnv([lambda: env])
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0].load(pretrained_model, vec_env)
    else:
        model = implemented_combos[arg_dict["algo"]][arg_dict["train_framework"]][0](*model_args, **model_kwargs)

    if arg_dict["algo"] == "gail":
        # Multi processing: (using MPI)
        if arg_dict["train_framework"] == 'tensorflow':
            # Generate expert trajectories (train expert)
            generate_expert_traj(model, model_name, n_timesteps=3000, n_episodes=100)
            # Load the expert dataset
            dataset = ExpertDataset(expert_path=model_name+'.npz', traj_limitation=10, verbose=1)
            model = GAIL_T('MlpPolicy', model_name, dataset, verbose=1)
            # Note: in practice, you need to train for 1M steps to have a working policy

    start_time = time.time()
    callbacks_list = []
    if pretrained_model:
        model_logdir = os.path.dirname(pretrained_model)
    auto_save_callback = SaveOnBestTrainingRewardCallback(check_freq=1024, logdir=model_logdir, env=env, engine=arg_dict["engine"], multiprocessing=arg_dict["multiprocessing"])
    callbacks_list.append(auto_save_callback)
    if arg_dict["eval_freq"]:
        eval_env = configure_env(arg_dict, model_logdir, for_train=False)
        eval_callback = CustomEvalCallback(eval_env, log_path=model_logdir,
                                           eval_freq=arg_dict["eval_freq"],
                                           n_eval_episodes=arg_dict["eval_episodes"],
                                           record=arg_dict["record"],
                                           camera_id=arg_dict["camera"])
        callbacks_list.append(eval_callback)
    #callbacks_list.append(PlottingCallback(model_logdir))
    with ProgressBarManager(total_timesteps=arg_dict["steps"]) as progress_callback:
        callbacks_list.append(progress_callback)
        model.learn(total_timesteps=arg_dict["steps"], callback=callbacks_list)
    model.save(os.path.join(model_logdir, model_name))
    print("Training time: {:.2f} s".format(time.time() - start_time))

    # info_keywords in the Monitor class above is necessary for pybullet to save_results;
    # using info_keywords with mujoco raises an error
    if arg_dict["engine"] == "pybullet":
        save_results(arg_dict, model_name, env, model_logdir)
    return model
Example #15
def test_gail_callback(tmp_path):
    dataset = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, traj_limitation=10,
                            sequential_preprocessing=True, verbose=0)
    model = GAIL("MlpPolicy", "Pendulum-v0", dataset)
    checkpoint_callback = CheckpointCallback(save_freq=500, save_path=str(tmp_path / 'logs/gail/'), name_prefix='gail')
    model.learn(total_timesteps=1000, callback=checkpoint_callback)
    shutil.rmtree(str(tmp_path / 'logs/gail/'))
    del dataset, model
Example #16
def test_pretrain_twice(tmp_path):
    """
    Test pretraining twice in the same execution.
    """
    dataset = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, traj_limitation=10,
                            sequential_preprocessing=True, verbose=0)
    model = PPO2("MlpPolicy", "Pendulum-v0")
    model.pretrain(dataset, n_epochs=5)
    model.pretrain(dataset, n_epochs=5)
    del dataset, model
Example #17
def test_behavior_cloning_box(tmp_path, model_class):
    """
    Behavior cloning with continuous actions.
    """
    dataset = ExpertDataset(expert_path=EXPERT_PATH_PENDULUM, traj_limitation=10,
                            sequential_preprocessing=True, verbose=0)
    model = model_class("MlpPolicy", "Pendulum-v0")
    model.pretrain(dataset, n_epochs=20)
    model.save(str(tmp_path / "test-pretrain"))
    del dataset, model
Example #18
def train_bc_agent(model_save_dir, bc_params, num_epochs=1000, lr=1e-4, adam_eps=1e-8):
    # Extract necessary expert data and save in right format
    expert_trajs = get_trajs_from_data(**bc_params["data_params"])
    
    # Load the expert dataset
    save_npz_file(expert_trajs, "temp.npz")
    dataset = ExpertDataset(expert_path="temp.npz", verbose=1, train_fraction=0.85)
    assert dataset is not None
    assert dataset.train_loader is not None
    return bc_from_dataset_and_params(dataset, bc_params, model_save_dir, num_epochs, lr, adam_eps)
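
`save_npz_file` is project-specific, but whatever it writes has to follow the key layout `ExpertDataset` expects; a sketch of that layout with dummy shapes (the sizes are illustrative only):

import numpy as np

n_transitions, n_episodes = 200, 2
episode_starts = np.zeros(n_transitions, dtype=bool)
episode_starts[::n_transitions // n_episodes] = True  # mark the first step of each episode

numpy_dict = {
    'obs': np.zeros((n_transitions, 4)),      # one row per transition
    'actions': np.zeros((n_transitions, 1)),
    'rewards': np.zeros(n_transitions),
    'episode_starts': episode_starts,
    'episode_returns': np.zeros(n_episodes),  # one entry per episode
}
np.savez("temp.npz", **numpy_dict)  # keys must be expanded, not saved as a single dict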
Example #19
def train(params):

    # create model
    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("model_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("model_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy,
                     env,
                     verbose=1,
                     tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"),
                     gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"),
                     lam=params.get("gae_lambda"))
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("expert_exists") is False:
        print("Training expert trajectories")
        # Train expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model,
                             expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(
        expert_path='{0}.npz'.format(expert_name),
        traj_limitation=-1,
        randomize=True,  # if the dataset should be shuffled
        verbose=1)

    model = GAIL('MlpPolicy', env, dataset, verbose=1,
                 tensorboard_log=log_dir)  # remaining hyperparameters use the GAIL defaults

    if params.get("pre_train") is True:
        print("Pretraining the policy with behavioural cloning")
        model.pretrain(dataset, n_epochs=10000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save("BC" + exp_name)

    env.close()
    del env
Example #20
    def _pretrain(self):
        if self.config['meta'].get('pretrain', None):
            logging.info("Starting pretraining.")
            pretrain_config = copy.deepcopy(self.config['meta']['pretrain'])
            archive_location = pretrain_config.get('expert_path')
            n_epochs = pretrain_config.pop('n_epochs', 1000)
            assert os.path.exists(
                archive_location
            ), "Could not find archive with pretraining data at {}".format(
                archive_location)
            dataset = ExpertDataset(**pretrain_config)
            self.agent.pretrain(dataset, n_epochs=n_epochs)
Example #21
def get_expert_dataset(
    expert,
    venv,
    total_timesteps,
):
    filename = f"/tmp/{uuid.uuid4()}"
    n_episodes = total_timesteps // get_horizon(venv)

    generate_expert_traj(expert,
                         save_path=filename,
                         env=venv,
                         n_episodes=n_episodes)
    dataset = ExpertDataset(expert_path=f"{filename}.npz", verbose=0)

    return dataset
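
`get_horizon` is not defined in this snippet; a plausible sketch, assuming the vectorized env wraps a standard gym `TimeLimit` env that exposes `spec.max_episode_steps`:

def get_horizon(venv):
    # Hypothetical helper: episode length limit of the first wrapped env.
    return venv.get_attr("spec")[0].max_episode_steps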
Example #22
def train_gail_withppo2():
    env = gimbal(5, 500)
    env = DummyVecEnv([lambda: env])
    model = PPO2.load("./models/baseline_ppo2_t1")
    generate_expert_traj(model,
                         './models/baseline_expert_t1',
                         env,
                         n_timesteps=0,
                         n_episodes=100)
    dataset = ExpertDataset(expert_path='./models/baseline_expert_t1.npz',
                            traj_limitation=-1,
                            verbose=1)
    model = GAIL("MlpPolicy", env, dataset, verbose=1)
    model.learn(total_timesteps=500000)
    model.save("./models/baseline_gail_ppo2_t1")
Example #23
def test_pretrain_images(tmp_path):
    env = make_atari_env("PongNoFrameskip-v4", num_env=1, seed=0)
    env = VecFrameStack(env, n_stack=4)
    model = PPO2('CnnPolicy', env)
    generate_expert_traj(model, str(tmp_path / 'expert_pong'), n_timesteps=0, n_episodes=1,
                         image_folder=str(tmp_path / 'pretrain_recorded_images'))

    expert_path = str(tmp_path / 'expert_pong.npz')
    dataset = ExpertDataset(expert_path=expert_path, traj_limitation=1, batch_size=32,
                            sequential_preprocessing=True)
    model.pretrain(dataset, n_epochs=2)

    shutil.rmtree(str(tmp_path / 'pretrain_recorded_images'))
    env.close()
    del dataset, model, env
Example #24
    def pre_train(self, num_e=1, load="saves/m19"):
        env_id = 'default'
        num_e = 1
        log_dir = "saves"
        # Use only one expert trajectory;
        # specify `traj_limitation=-1` to use the whole dataset
        dataset = ExpertDataset(expert_path='default2.npz',
                                traj_limitation=1,
                                batch_size=128)
        self.env = SubprocVecEnv(
            [self.make_env(env_id, i) for i in range(num_e)])
        #env = Template_Gym()
        #self.env = DummyVecEnv([lambda: env])
        #self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #env = make_env()
        #model = GAIL("MlpPolicy", env=env, expert_dataset=dataset, verbose=1)
        #self.env.save_running_average("saves"+self.config.pair)
        self.model = PPO2(MlpPolicy,
                          self.env,
                          verbose=1,
                          nminibatches=1,
                          learning_rate=1e-5,
                          tensorboard_log="./m1ln4")
        #self.model = PPO2.load("saves/m19", self.env, policy=CustomPolicy, tensorboard_log="./default/" )
        #self.env.save_running_average("saves"+self.config.pair)
        # Pretrain the PPO2 model
        self.model.pretrain(dataset, n_epochs=10000)

        # As an option, you can train the RL agent
        #self.model.learn(int(100000000))

        # Test the pre-trained model
        self.env = self.model.get_env()
        #self.env.save_running_average("saves"+self.config.pair)
        obs = self.env.reset()

        reward_sum = 0.0
        for _ in range(11):
            action, _ = self.model.predict(obs)
            obs, reward, done, _ = self.env.step(action)
            reward_sum += reward
            #self.env.render()
            if done:
                print(reward_sum)
                reward_sum = 0.0
                obs = self.env.reset()

        self.env.close()
Example #25
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device

    # train expert model for multiple times and save the best model
    best_reward = -np.inf
    train_env = make_vec_env(args.env, n_envs=args.n_env)
    eval_env = gym.make(args.env)

    for i in range(args.times_expert):
        train_env.reset()
        train_log_dir = os.path.join(args.train_log_dir,
                                     args.env + '_' + args.expert)
        if args.expert == 'PPO':
            expert_model = PPO2(args.policy_type, env=train_env, n_steps=args.n_steps, nminibatches=args.nminibatches, noptepochs=args.noptepochs, ent_coef=args.ent_coef,\
                            lam=args.lam, gamma=args.gamma, cliprange=args.cliprange, learning_rate=args.learning_rate, verbose=1, tensorboard_log=train_log_dir)
        else:
            raise NotImplementedError
        expert_model.learn(total_timesteps=args.expert_training_step)
        mean_reward = evaluate(expert_model, eval_env, num_steps=10000)
        if mean_reward > best_reward:
            best_reward = mean_reward
            expert_model.save(
                os.path.join(args.train_log_dir, args.env + '_expert'))
        del expert_model
    train_env.reset()
    expert_model = PPO2.load(os.path.join(args.train_log_dir,
                                          args.env + '_expert'),
                             env=train_env)
    generate_expert_traj(expert_model,
                         os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=-1,
                         n_episodes=args.expert_episodes)
    train_env.close()

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir,
                                                     'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type,
                      args.env,
                      dataset,
                      verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)

    evaluate(gail_model, eval_env, num_steps=10000)
    gail_model.save(os.path.join(args.train_log_dir, args.env + '_GAIL'))
    eval_env.close()
Example #26
def run_gail():
    parser = argparse.ArgumentParser()
    parser.add_argument('expert',
                        type=str,
                        default=None,
                        help='Expert path (*.npz)')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--note', type=str, default='test')
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--num-steps', type=int, default=1000000)
    parser.add_argument('--policy',
                        type=str,
                        default='CnnPolicy',
                        choices=[
                            'CnnPolicy', 'CnnLstmPolicy', 'CnnLnLstmPolicy',
                            'MlpPolicy', 'MlpLstmPolicy', 'MlpLnLstmPolicy'
                        ],
                        help='Policy architecture')
    args = parser.parse_args()

    logger.configure(os.path.join('logs', args.env, args.note))
    logger.info(args)

    if 'NoFrameskip' in args.env:
        env = VecFrameStack(make_atari_env(args.env, 1, args.seed), 4)
    else:
        import gym
        env = gym.make(args.env)

    dataset = ExpertDataset(expert_path=args.expert,
                            batch_size=128,
                            train_fraction=0.99,
                            verbose=1)
    model = GAIL(args.policy,
                 env,
                 dataset,
                 timesteps_per_batch=1280,
                 verbose=1)
    model.learn(len(dataset.train_loader) * 1280)
Example #27
def main():
    env = gym.make("BowlingNoFrameskip-v0")
    env = MaxAndSkipEnv(env, skip=4)
    env = WarpFrame(env)
    env = DummyVecEnv([lambda: env])

    dataset = ExpertDataset(expert_path="bowling_demo.npz", verbose=1)

    model = PPO2("CnnPolicy", env, verbose=1)
    model.pretrain(dataset, n_epochs=1000)
    model.save("bowling_model")

    state = env.reset()
    total_reward = 0
    while True:
        env.render()
        time.sleep(1 / 60)
        action, _ = model.predict(state)
        state, reward, done, info = env.step(action)
        total_reward += reward[0]
        if done:
            print(total_reward)
            state = env.reset()
            total_reward = 0
Example #28
def train_agent_with_gail(load):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines import GAIL
    import tensorflow as tf  # needed for inspecting variables in the load branch

    env = gym.make("F16GCAS-v0")

    class CustomPolicy(MlpPolicy):
        def __init__(self, *args, **kwargs):
            super(CustomPolicy, self).__init__(*args, **kwargs,
                                               layers=[128, 128])
    if not load:
        ExpData = ExpertDataset("./lqr_export.npz")
        model = GAIL(CustomPolicy, env, ExpData, verbose=1)
        model.learn(total_timesteps=1000000)
        model.save(ROOT+"/trained_models/TDRL/f16/gail/128_128")
    else:
        # with model.graph.as_default():
        #     for i in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='model/pi/'):
        #         print(i)
        model = GAIL.load(ROOT+"/trained_models/TDRL/f16/gail/128_128", env=env)
        with model.graph.as_default():
            print(tf.all_variables())

    return model
Example #29
def main():
    global save_path, log_dir, model, best_mean_reward
    mk_dir(args.checkpoint_dir + args.policy)
    save_path = args.checkpoint_dir + args.policy + "/" + args.policy
    log_dir = args.summary_dir + args.policy
    mk_dir(log_dir)
    env = gym.make("SegmentationEnv-v0",
                   objs_dir=args.objs_dir,
                   max_scenes=args.max_scenes,
                   sample_size=args.sample_size,
                   diff_punishment=args.diff_punishment,
                   max_steps_per_scene=args.max_steps_per_scene,
                   scene_mode=args.scene_mode,
                   point_mode=args.point_mode,
                   voxel_size=args.voxel_size,
                   voxel_mode=args.voxel_mode,
                   single_scenes=args.single_scenes,
                   early_diff=args.early_diff,
                   wall_weight=args.wall_weight)
    env = Monitor(env, log_dir, allow_early_resets=True)

    env = DummyVecEnv([
        lambda: env
    ])  # The algorithms require a vectorized environment to run
    env = VecCheckNan(env, raise_exception=True)

    net_module = importlib.import_module(args.policy)
    model = PPO2(net_module.Policy,
                 env,
                 verbose=args.verbose,
                 tensorboard_log=log_dir,
                 learning_rate=args.learning_rate,
                 ent_coef=args.ent_coef,
                 cliprange=args.cliprange,
                 cliprange_vf=args.cliprange_vf,
                 lam=args.lam,
                 gamma=args.gamma,
                 seed=args.seed,
                 n_cpu_tf_sess=args.n_cpu_tf_sess,
                 noptepochs=args.noptepochs,
                 nminibatches=args.nminibatches,
                 n_steps=args.n_steps,
                 max_grad_norm=args.max_grad_norm)

    if os.path.isfile("expert_trajectories.npz") and args.pretrain == 1:
        print("------------start pretrain------------")
        #dataset = ExpertDataset(expert_path="expert_trajectories.npz", special_shape=True, traj_limitation=100, batch_size=16)
        dataset = ExpertDataset(expert_path="expert_trajectories.npz",
                                special_shape=True,
                                train_fraction=args.train_fraction,
                                batch_size=args.pretrain_batch_size)
        #model.pretrain(dataset, learning_rate=0.001, n_epochs=1000)
        model = model.pretrain(dataset,
                               val_interval=1,
                               learning_rate=args.pretrain_learning_rate,
                               n_epochs=args.pretrain_n_epochs)
        print("pretrain finished -- save model")
        model.save(save_path)
        returns = []

        print("Calculate mean reward")
        n_episodes = 10
        for i in range(n_episodes):
            total_reward = 0
            obs = env.reset()
            while True:
                action, _states = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                total_reward += reward
                if done:
                    returns.append(total_reward)
                    break
        returns = np.array(returns)
        best_mean_reward = np.mean(returns)
        print("Best mean reward: {:.2f}".format(best_mean_reward))

    model.learn(total_timesteps=args.total_timesteps, callback=callback)
    env.close()
Example #30
# generate expert trajectory
env_depth, env_width, nlayers = 3, 3, 2


def expert(obs):
    try:
        state = State(env_depth, env_width).load_obs(obs)
        return get_behav(state, weights={'fr': 0.3})
    except NoPathError:
        return np.zeros(env_depth * 2)


# generate_expert_traj(expert, 'expert', Env(env_depth, env_width, nlayers), n_episodes=100)

# pretrain model
dataset = ExpertDataset(expert_path='expert.npz')
model = SAC('MlpPolicy', Env(env_depth, env_width, nlayers), verbose=1)
model.pretrain(dataset, n_epochs=5000)
model.save('pretrained_sac')

# Test the pre-trained model
env = model.get_env()
obs = env.reset()

reward_sum = 0
i = 0
for j in range(1000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    reward_sum += reward
    i += 1