Example #1
def test_model_manipulation(model_class, goal_selection_strategy):
    env = BitFlippingEnv(N_BITS,
                         continuous=model_class in [DDPG, SAC],
                         max_steps=N_BITS)
    env = DummyVecEnv([lambda: env])

    model = HER('MlpPolicy',
                env,
                model_class,
                n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy,
                verbose=0)
    model.learn(1000)

    model_predict(model, env, n_steps=100, additional_check=None)

    model.save('./test_her')
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her', env=VecNormalize(env))

    model = HER.load('./test_her')

    # Check that the model raises an error when the env
    # is not wrapped (or no env passed to the model)
    with pytest.raises(ValueError):
        model.predict(env.reset())

    env_ = BitFlippingEnv(N_BITS,
                          continuous=model_class in [DDPG, SAC],
                          max_steps=N_BITS)
    env_ = HERGoalEnvWrapper(env_)

    model_predict(model, env_, n_steps=100, additional_check=None)

    model.set_env(env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    del model

    env = BitFlippingEnv(N_BITS,
                         continuous=model_class in [DDPG, SAC],
                         max_steps=N_BITS)
    model = HER.load('./test_her', env=env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    if os.path.isfile('./test_her.pkl'):
        os.remove('./test_her.pkl')
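The ValueError check above illustrates the rule that HER's predict needs either an env passed at load time or observations that are already flattened. A minimal sketch of the second pattern, assuming stable-baselines 2.x and a checkpoint such as the './test_her' file saved by this test (adjust `continuous` to match the algorithm the checkpoint was trained with):

from stable_baselines import HER
from stable_baselines.her import HERGoalEnvWrapper
from stable_baselines.common.bit_flipping_env import BitFlippingEnv

# Wrap the goal env so reset()/step() return flattened observations that
# HER.predict can consume even when no env was passed to HER.load.
env = HERGoalEnvWrapper(BitFlippingEnv(10, continuous=False, max_steps=10))
model = HER.load('./test_her')  # hypothetical checkpoint path from the test above
obs = env.reset()
action, _ = model.predict(obs)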
Example #2
def main(load_policy=True):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    gamma = 0.9
    memory_limit = 1000000
    timesteps = 15000000
    discreteAction = 0
    rend = False
    # learning rate


    env = bioEnv()
  
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    # Wrap the model

    model = HER(CustomTD3Policy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/bioEnv_TD3",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if load_policy:
        model = HER.load("models/TD3/curriculum/best_model_part_11_10g_TRUE.pkl",
                         env=env, n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/bioEnv_TD3",
                         buffer_size=1000000, batch_size=256,
                         random_exploration=0.3, action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)

    model.save("policy_TD3_Discr")
Example #3
def main(env_id):
    env_type = 'robotics'
    env = gym.make(env_id)

    save_file = "/home/shivanik/fetch_trial.zip"
    video_file = "/home/shivanik/fetch_her_videos/"
    env_recorder = NonVecRecorder(env)

    video_len = 100

    model = HER.load(save_file, env=env)
    action_spec = env.action_space
    env.reset()
    i = 0
    record = False
    num_files = 4
    obs = env.reset()
    for i in range(num_files):
        fname = video_file + "%d.mp4" % i
        print(fname)
        env_recorder.init_video_writing(fname=fname)
        for j in range(video_len):
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            env_recorder.viz(True)
            if done:
                obs = env.reset()
        env_recorder.close()
        env.reset()

    env.close()
Example #4
def load_model(eval_env):
  # WARNING: you must pass an env
  # or wrap your environment with HERGoalEnvWrapper to use the predict method
  model = HER.load('./her_robot_env', env=eval_env)
  count = 0
  step_num_arr = []
  for _ in range(20):
    number_steps = 0
    obs = eval_env.reset()
    for _ in range(400):
      action, _ = model.predict(obs)
      obs, reward, done, _ = eval_env.step(action)
      number_steps += 1
      # print(obs['achieved_goal'][0:3], obs['desired_goal'][0:3], reward)
      if done:
        step_num_arr.append(number_steps)
        count += 1
        print("----------------It reached terminal state -------------------")
        break
  print(
    "Robot reached the goal position successfully ",
    count,
    " times and the Average step count was ",
    np.average(np.array(step_num_arr))
  )
Example #5
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 8000000
    rend = False

    obj_pose_rnd_std = 0

    env = pandaPushGymGoalEnv(renders=rend,
                              use_IK=0,
                              numControlledJoints=action_space,
                              obj_pose_rnd_std=obj_pose_rnd_std,
                              includeVelObs=True)

    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))
    # Wrap the model

    model = HER(
        CustomTD3Policy,
        env,
        model_class,
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        verbose=1,
        tensorboard_log=
        "../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
        buffer_size=1000000,
        batch_size=256,
        random_exploration=0.3,
        action_noise=action_noise)

    if load_policy:
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            tensorboard_log=
            "../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
            buffer_size=1000000,
            batch_size=256,
            random_exploration=0.3,
            action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/TD3_phase1_target_fixed")
Example #6
def test_HER(env, out_dir, seed=None, **kwargs):

  model = HER.load(os.path.join(out_dir, 'final_model.pkl'), env=env)

  #model.learn(total_timesteps=10000)
  # Evaluate the trained agent
  mean_reward = evaluate(env, model, num_steps=5000)

  return
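The `evaluate` helper used above is not shown in this snippet. A minimal, hypothetical stand-in that rolls out the policy for a fixed number of steps and returns the mean per-episode reward could look like this (the original project's helper may differ):

def evaluate(env, model, num_steps=1000):
    # Roll out the policy and accumulate rewards per episode.
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
    mean_reward = sum(episode_rewards) / len(episode_rewards)
    print("Mean reward over {} episodes: {:.2f}".format(len(episode_rewards), mean_reward))
    return mean_reward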
Example #7
def launchAgent(model_name: str):
    """
    :param model_name: 실행시킬 모델의 종류. HER, DDPG, PPO2 혹은 기타값(DQN)이어야 함
                        현재는 의도상 PPO2로 세팅할 것
    :return: 1000회의 사이클을 돌고 난 이후의 모델
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    print("Current Env is " + model_name)

    if model_name == "HER":
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    if model_name == "DDPG":
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    if model_name == "PPO2":
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",  # policy
            env=env,  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    for i in range(1000):
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i), env)
            if model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i), env)
            if model_name == "PPO2":
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i), env)

        # print('model learn start')
        model.learn(total_timesteps=12500)  # minimum value at which the FPS stays above 130
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))
        # print('model learn finished')

        # print('model save start')
        model.save("detailedmap_" + model_name + "_" + str(i + 1))
        # keep `model` bound here so the final model can be returned below
        # print('model save end')

    return model
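A dispatch table is one way to avoid the fall-through that the if/elif chain above guards against. The sketch below is only an illustration of that idea; it reuses the same image_env module and algorithms but is not code from the original project:

def build_model(model_name, image_env):
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    builders = {
        "HER": lambda: HER("CnnPolicy", env=image_env.DetailedMiniMapEnv(), model_class=DQN),
        "DDPG": lambda: DDPG(policy="CnnPolicy", env=image_env.DDPGImageEnv(),
                             normalize_observations=True),
        "PPO2": lambda: PPO2(policy="CnnPolicy",
                             env=make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1),
                             verbose=1),
    }
    # Any other name falls back to DQN, mirroring the else branch above.
    default = lambda: DQN("CnnPolicy", env=image_env.DetailedMiniMapEnv(),
                          double_q=True, prioritized_replay=True, verbose=0)
    return builders.get(model_name, default)()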
Example #8
    def train_curriculum_fetch(self, env_name="Merging-v0"):
        """
        Trains reward curriculum
        """
        self.curriculum = [env_name]
        bs2model_ours = {'RL': BR_BL0_BL1_BL5, 'LR': BL_BR0}
        bs2model = {'RL': BR_s, 'LR': BL_s}
        for l, lesson in enumerate(self.curriculum):
            for bs in bs2model.keys():
                self.bs = bs
                for seed in [101, 102]:
                    if self.expt_type == "ours":
                        model_info = bs2model_ours[self.bs]
                    else:
                        model_info = bs2model[self.bs]
                    model_dir = os.path.join(model_info[0], model_info[1],
                                             model_info[2])
                    if self.model_type == "PPO":
                        self.model = PPO2.load(
                            model_dir)  # loads pre-trained model
                    elif self.model_type == "HER":
                        self.model = HER.load(
                            model_dir)  # loads pre-trained model
                    print(f"\ntraining on {lesson}, bs {self.bs}, seed{seed}")
                    self.seed = seed
                    self.experiment_name = f"{self.bs}_{self.expt_type}_{seed}"
                    print("EXPT NAME: ", self.experiment_dir1,
                          self.experiment_name)
                    self.experiment_dir = os.path.join(self.experiment_dir1,
                                                       self.experiment_name)
                    self.create_eval_dir()
                    env = gym.make(lesson)
                    eval_env = gym.make(lesson)

                    if self.bs == 'RL':
                        env._set_homotopy_class('left')
                        eval_env._set_homotopy_class('left')
                    elif self.bs == 'LR':
                        env._set_homotopy_class('right')
                        eval_env._set_homotopy_class('right')

                    if self.model_type == "HER":
                        env = HERGoalEnvWrapper(env)
                        eval_env = HERGoalEnvWrapper(eval_env)
                        print("hc: ", env.env.homotopy_class)
                    else:
                        env = DummyVecEnv([lambda: env])
                    self.model.set_env(env)
                    self.model.seed = self.seed
                    self.model = train(self.model, eval_env, self.timesteps,
                                       self.experiment_dir, self.is_save,
                                       self.eval_save_period, self.rets_path,
                                       l)
Example #9
    def train_curriculum(self, env_name="Merging-v0"):
        """
        Trains reward curriculum
        """
        self.curriculum = [env_name]
        bs2model_ours = {1: B1R_B0L, 3: B3R_B0L, 5: B5R_B0L2, 7: B7R_B0L_B4L1}
        bs2model = {1: B1R, 3: B3R, 5: B5R, 7: B7R}
        for l, lesson in enumerate(self.curriculum):
            for seed in [201, 202, 203, 204, 205]:
                if self.expt_type == "ours":
                    model_info = bs2model_ours[int(self.bs)]
                elif self.expt_type == "finetune":
                    model_info = bs2model[int(self.bs)]
                model_dir = os.path.join(model_info[0], model_info[1],
                                         model_info[2])
                if self.model_type == "PPO":
                    self.model = PPO2.load(
                        model_dir)  # loads pre-trained model
                elif self.model_type == "HER":
                    self.model = HER.load(model_dir)  # loads pre-trained model
                print(f"\ntraining on {lesson}, bs {self.bs}, seed{seed}")
                self.seed = seed
                self.experiment_name = f"{self.bs}_{self.expt_type}_{seed}"
                print("EXPT NAME: ", self.experiment_dir1,
                      self.experiment_name)
                self.experiment_dir = os.path.join(self.experiment_dir1,
                                                   self.experiment_name)
                self.create_eval_dir()
                env = gym.make(lesson)
                eval_env = gym.make(lesson)

                env._set_barrier_size(self.bs)
                env._set_homotopy_class('left')
                eval_env._set_barrier_size(self.bs)
                eval_env._set_homotopy_class('left')

                if self.model_type == "HER":
                    env = HERGoalEnvWrapper(env)
                    eval_env = HERGoalEnvWrapper(eval_env)
                    print("bs: ", env.env.barrier_size)
                    print("hc: ", env.env.homotopy_class)
                else:
                    env = DummyVecEnv([lambda: env])
                self.model.set_env(env)
                self.model.set_random_seed(self.seed)
                ### ENTROPY###
                #self.model.ent_coef = 0.05
                self.model = train(self.model, eval_env, self.timesteps,
                                   self.experiment_dir, self.is_save,
                                   self.eval_save_period, self.rets_path, l)
Example #10
    def __init__(self, env_id, exp_id, model_path, trajectory_type,
                 episode_timesteps, noise_parameters):
        self.env_id = env_id
        self.exp_id = exp_id
        self.trajectory_type = trajectory_type
        # Load model and environment
        self.env = HERGoalEnvWrapper(
            gym.make(env_id, **{'noise_parameters': noise_parameters}))
        self.model = HER.load(model_path, env=self.env)
        self.episode_timesteps = episode_timesteps

        # Setup subscriber for trajectory generator
        # self.line_trajectory_timer = rospy.Timer(rospy.Duration(0.1), self.line_trajectory_callback)
        # self.circle_trajectory_timer = rospy.Timer(rospy.Duration(0.01), self.circle_trajectory_callback)

        # Line trajectory settings
        if self.trajectory_type == "line":
            self.start_p = np.array([20, 0, 100]) / 1000
            self.finish_p = np.array([20, 40, 100]) / 1000
            self.del_p = self.finish_p - self.start_p
            self.current_goal = self.start_p

        # Circle trajectory settings
        if self.trajectory_type == "circle":
            self.offset = np.array([20, 20, 100]) / 1000
            self.radius = 20.0 / 1000
            self.thetas = np.arange(0, 2 * np.pi, np.deg2rad(5))
            self.thetas_counter = 0
            self.start_p = self.offset
            self.current_goal = self.start_p

        # Start timer
        self.prev_time = rospy.get_time()

        # Complete trajectory check
        self.shape_df = pd.DataFrame(columns=[
            'episode', 'timestep', 'r1x', 'r1y', 'r1z', 'r2x', 'r2y', 'r2z',
            'r3x', 'r3y', 'r3z'
        ])
        # self.goals_df = pd.DataFrame(columns=['ag_x', 'ag_y', 'ag_z', 'dg_x', 'dg_y', 'dg_z'])
        self.traj_complete = False

        self.achieved_goals = np.array([])
        self.desired_goals = np.array([])
        self.episode_count = 0
Example #11
def load_model(model_info, model_type="PPO", baseline=None, pkl_file=None):
    model_dir = os.path.join(model_info[0], model_info[1], model_info[2])
    if model_type == "PPO":
        if baseline == 'L2SP':
            from baselines.L2SP.model import PPO2L2SP
            import baselines.L2SP.utils as L2SP_utils
            data, params = L2SP_utils.load_from_file(model_dir)
            model = PPO2L2SP.load(model_dir, original_params=params)
        elif baseline == 'PNN':
            from baselines.PNN.utils import looseload, resave_params_for_PPN
            output_dir = os.path.join(
                "output/updated_gridworld_continuous_PNN", 'resave',
                model_info[2])
            resave_params_for_PPN(model_dir, output_dir)
            model = looseload(PPO2, output_dir)
        elif baseline == 'BSS':
            from baselines.BSS.utils import resave_params_for_BSS
            from baselines.BSS.model import PPO2BSS
            output_dir = os.path.join(
                "output/updated_gridworld_continuous_BSS", 'resave',
                model_info[2])
            resave_params_for_BSS(model_dir, output_dir)
            model = PPO2BSS.load(output_dir, bss_coef=0.001, l2_coef=0.0005)
        else:
            model = PPO2.load(model_dir)

    elif model_type == "HER":
        if baseline == 'L2SP':
            from baselines_fetch.L2SP.model import HER2L2SP
            import baselines_fetch.L2SP.utils as L2SP_utils
            data, params = L2SP_utils.load_from_file(model_dir)
            model = HER2L2SP.load(model_dir, original_params=params)
        elif baseline == "PNN":
            from baselines_fetch.PNN.model import HER2PNN
            from baselines_fetch.PNN.utils import resave_params_for_PNN
            output_dir = os.path.join("output/fetch_PNN", 'resave',
                                      model_info[2])
            resave_params_for_PNN(model_dir, output_dir)
            model = HER2PNN.load(output_dir)
        elif baseline == "BSS":
            pass
        else:
            model = HER.load(model_dir)
    return model
Example #12
def main(load_policy=False):

    global log_dir, log_dir_policy
    if load_policy:
        log_dir_policy = '../policies/PUSHING_TD3+HER_FIXED_POSITION_DYN_RAND_FROM_FIXED_PHYSICS'
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    fixed = True
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1500000
    discreteAction = 0
    rend = False
    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=0,
                                 isDiscrete=discreteAction, action_space=action_space,
                                 fixedPositionObj=fixed, includeVelObs=True)


    env = Monitor(env, log_dir, allow_early_resets=True)
    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    # Wrap the model

    model = HER(CustomPolicy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if load_policy:
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                         env=env, n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND_FROM_FIXED_PHYSICS",
                         buffer_size=1000000, batch_size=256,
                         random_exploration=0.3, action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps, callback=callback)
    model.save("../policies/PUSHING_FIXED_TD3_DYN_RAND")
    print("Finished train1")
Example #13
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    fixed = True
    #0 completely fixed, 1 slightly random radius, 2 big random radius,
    object_position = 1
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 5000000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=1,
                             isDiscrete=discreteAction, action_space=action_space,
                             fixedPositionObj=fixed, includeVelObs=True,
                             object_position=object_position)

    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    # Wrap the model

    model = HER(CustomTD3Policy, env, model_class, n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy, verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
                buffer_size=1000000, batch_size=256,
                random_exploration=0.3, action_noise=action_noise)

    if load_policy:
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                         env=env, n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
                         buffer_size=1000000, batch_size=256,
                         random_exploration=0.3, action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK")
Example #14
def main():
    panda_env = PandaGraspGymEnv(urdf_root=object_data.getDataPath(),
                                 is_rendering=True,
                                 use_ik=True,
                                 is_discrete=True,
                                 num_controlled_joints=7,
                                 reward_type="sparse")

    env = HERGoalEnvWrapper(panda_env)

    model = HER.load("logs/rl_model_1000000_steps.zip")

    episode_rewards, episode_lengths, episode_success = evaluate_policy(
        model,
        env,
        n_eval_episodes=50,
        render=False,
        deterministic=True,
        return_episode_rewards=True)
    print("Final Reward {}, Episode Length{}, Success Rate {}".format(
        np.mean(episode_rewards), np.mean(episode_lengths),
        np.mean(episode_success)))
Example #15
 def __init__(self,
              env: ISettableGoalEnv,
              verbose=1,
              rank=0,
              experiment_name="her-sac"):
     self._env = env
     self._dirs = Dirs(
         experiment_name=f"{type(env).__name__}-{experiment_name}",
         rank=rank)
     options = {
         "env": env,
         "tensorboard_log": self._dirs.tensorboard,
         "model_class": SAC,
         "gamma": 1,
         "learning_rate": 3e-3
     }
     if os.path.isdir(self._dirs.models) and os.path.isfile(
             self._dirs.best_model):
         self._model = HER.load(load_path=self._dirs.best_model, **options)
         print(f"Loaded model {self._dirs.best_model}")
     else:
         self._model = HER(policy="MlpPolicy", verbose=verbose, **options)
Example #16
    #env = SubprocVecEnv([make_env(env_id, log_dir, i+worker_id) for i in range(num_env)])
    model_class = DQN
    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
    model = HER('MlpPolicy',
                env,
                model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1)
    model.learn(total_timesteps=1000)
    model.save(log_dir + "model")

    # WARNING: you must pass an env
    # or wrap your environment with HERGoalEnvWrapper to use the predict method
    model = HER.load(log_dir + "model", env=env)

    #evaluate agent
    episodes = 100
    ep_r = []
    ep_l = []
    for e in range(episodes):
        obs = env.reset()
        total_r = 0.
        total_l = 0.
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, infos = env.step(action)
            total_l += 1.
            total_r += rewards
            if dones:
                ep_r.append(total_r)
                ep_l.append(total_l)
                break
Example #17
def launchAgent():
    import Reinforcement_AI.env.d_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    model_name = "PPO2"

    if model_name == "HER":
        model = HER(
            "CnnPolicy",
            env=image_env.DetailedMiniMapEnv(),
            model_class=DQN
        )
    if model_name == "DDPG":
        model = DDPG(
            policy="CnnPolicy",
            env=image_env.DDPGImageEnv(),
            normalize_observations=True
        )
    if model_name == "PPO2":
        # env = image_env.DetailedMiniMapEnv()
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(
            policy="CnnPolicy",
            env=env,
            verbose=1
        )
    else:
        model = DQN(
            "CnnPolicy",  # policy
            env=image_env.DetailedMiniMapEnv(),  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    for i in range(1000):
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())
            if model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i))
                model.set_env(image_env.DDPGImageEnv())
            if model_name == "PPO2":
                # print('set env')
                # ppo2_env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
                # print('get model')
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
                # print('set model env')
                # model.set_env(ppo2_env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())

        # print('model learn start')
        model.learn(total_timesteps=3900)
        # print('model learn finished')

        # print('model save start')
        model.save("detailedmap_" + model_name + "_" + str(i+1))
        del model
Example #18
env, _ = load_env(env_name,
                  core_dir=core_dir,
                  envs_dir=envs_dir,
                  xmls_dir=xmls_dir,
                  return_args_remaining=True)
# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER('MlpPolicy',
            env,
            model_class,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            verbose=1)
# Train the model
model.learn(1000)

model.save("./her_bit_env")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./her_bit_env', env=env)

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)

    if done:
        obs = env.reset()
Example #19
            gamma=0.95,
            batch_size=256,
            ent_coef='auto',
            random_exploration=0.3,
            learning_starts=1000,
            train_freq=1,
            policy_kwargs=dict(layers=[256, 256, 256]),
            tensorboard_log="./OpenAI/")
# Train the model
model.learn(int(8e6))

model.save("./model2")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./model2', env=env)

obs = env.reset()
episodes = 0
successes = 0
step = 0
while (episodes < 50):
    step += 1
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    env.render()
    if done or step > 1000:
        obs = env.reset()
        episodes += 1
        if _['is_success']:
            successes += 1
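A small follow-up that could be appended after the loop above (it assumes the `successes` and `episodes` counters maintained there) to report the overall success rate:

# Summarize the 50-episode evaluation run above.
print("Success rate: {}/{} episodes ({:.1f}%)".format(
    successes, episodes, 100.0 * successes / episodes))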
Example #20
                                            sigma=float(0.5) *
                                            np.ones(n_actions))

model = DDPG(MlpPolicy,
             env,
             verbose=1,
             param_noise=None,
             action_noise=action_noise)
# Train the model
model.learn(1000)

model.save("./hideandseek")

# Reload the saved model for prediction, passing the env; since the model above is
# plain DDPG (not wrapped in HER), use DDPG.load rather than HER.load.
model = DDPG.load('./hideandseek', env=env)

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)

    if done:
        obs = env.reset()

#    print(main.__doc__)

#if __name__ == '__main__':
#    logging.getLogger('').handlers = []
#    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)
Example #21
def evaluation(env_id, exp_id, model_path, num_episodes, output_path):
    env = HERGoalEnvWrapper(gym.make(env_id))
    model = HER.load(model_path, env=env)

    seed = np.random.randint(0, 10)
    set_global_seeds(seed)

    goal_errors = np.empty((num_episodes), dtype=float)
    B_errors = np.empty((num_episodes), dtype=float)
    alpha_errors = np.empty((num_episodes), dtype=float)
    q_B_achieved = np.empty((num_episodes, 3), dtype=float)
    q_alpha_achieved = np.empty((num_episodes, 3), dtype=float)
    q_B_desired = np.empty((num_episodes, 3), dtype=float)
    q_alpha_desired = np.empty((num_episodes, 3), dtype=float)
    desired_goals = np.empty((num_episodes, 3), dtype=float)
    achieved_goals = np.empty((num_episodes, 3), dtype=float)
    starting_positions = np.empty((num_episodes, 3), dtype=float)
    q_B_starting = np.empty((num_episodes, 3), dtype=float)
    q_alpha_starting = np.empty((num_episodes, 3), dtype=float)

    # TODO: pre-allocate memory
    for episode in range(num_episodes):
        print('episode: ', episode)
        # Run random episodes and save sequence of actions and states to plot in matlab
        episode_reward = 0
        ep_len = 0
        obs = env.reset()
        while True:
            action, _ = model.predict(obs, deterministic=True)
            action = np.clip(action, env.action_space.low,
                             env.action_space.high)
            obs, reward, done, infos = env.step(action)

            episode_reward += reward
            ep_len += 1

            if done or infos.get('is_success', False):
                goal_errors[episode] = infos.get('errors_pos')
                q_B_desired[episode, :] = infos.get('q_desired')[:3]
                q_alpha_desired[episode, :] = infos.get('q_desired')[3:]
                q_B_achieved[episode, :] = infos.get('q_achieved')[:3]
                q_alpha_achieved[episode, :] = infos.get('q_achieved')[3:]
                desired_goals[episode, :] = infos.get('desired_goal')
                achieved_goals[episode, :] = infos.get('achieved_goal')
                starting_positions[episode, :] = infos.get('starting_position')
                q_B_starting[episode, :] = infos.get('q_starting')[:3]
                q_alpha_starting[episode, :] = infos.get('q_starting')[3:]
                break

    print('mean_errors: ', np.mean(goal_errors))
    eval_df = pd.DataFrame(data=np.column_stack(
        (desired_goals, achieved_goals, starting_positions, q_B_desired,
         q_B_achieved, q_B_starting, q_alpha_desired, q_alpha_achieved,
         q_alpha_starting)),
                           columns=[
                               'desired_goal_x',
                               'desired_goal_y',
                               'desired_goal_z',
                               'achieved_goal_x',
                               'achieved_goal_y',
                               'achieved_goal_z',
                               'starting_position_x',
                               'starting_position_y',
                               'starting_position_z',
                               'B_desired_1',
                               'B_desired_2',
                               'B_desired_3',
                               'B_achieved_1',
                               'B_achieved_2',
                               'B_achieved_3',
                               'B_starting_1',
                               'B_starting_2',
                               'B_starting_3',
                               'alpha_desired_1',
                               'alpha_desired_2',
                               'alpha_desired_3',
                               'alpha_achieved_1',
                               'alpha_achieved_2',
                               'alpha_achieved_3',
                               'alpha_starting_1',
                               'alpha_starting_2',
                               'alpha_starting_3',
                           ])
    eval_df.to_csv(output_path)
Example #22
model_class = DDPG

# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER('MlpPolicy', env, model_class, n_sampled_goal=4, goal_selection_strategy=goal_selection_strategy,verbose=1)

# # Train the model
# model.learn(1000)

# model.save("her_fetch_reach_env")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('her_fetch_reach_env', env=env)

obs = env.reset()
for _ in range(500):
    env.render()
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)

    if done:
        obs = env.reset()

# while not done:
#     env.render()
#     action, _ = model.predict(obs)
#     obs, reward, done, _ = env.step(action)
Example #23
fixed = False
# -g
gamma = 0.9
# -b
batch_size = 256
# -m
memory_limit = 1000000
# -t
timesteps = 2000000

discreteAction = 0
rend = False

env = bioEnv()

goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
# Wrap the model
model = HER.load("models/TD3/curriculum/best_model_part_11_10g_TRUE.pkl",
                 env=env)

obs = env.reset()
i = 0
for _ in range(10000):
    i += 1
    action, _ = model.predict(obs)
    print(action)
    obs, reward, done, _ = env.step(action)
    if done:
        print(str(i) + " " + str(done))
        obs = env.reset()
Example #24
# -b
batch_size = 16
# -m
memory_limit = 1000000
# -r
normalize_returns = True
# -t
timesteps = 10000000
policy_name = "pushing_policy"
discreteAction = 0
rend = True

env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(),
                             renders=rend,
                             useIK=0,
                             isDiscrete=discreteAction,
                             numControlledJoints=numControlledJoints,
                             fixedPositionObj=fixed,
                             includeVelObs=True)

# Wrap the model
model = HER.load("../policies/pushing_fixed_HER_Dyn_Rand0.pkl", env=env)

obs = env.reset()

for i in range(10000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    if done or i == 500:
        obs = env.reset()
Example #25
fixed = False
# -o
normalize_observations = False
# -g
gamma = 0.9
# -b
batch_size = 64
# -m
memory_limit = 1000000
# -r
normalize_returns = False
# -t
timesteps = 1000000
policy_name = "pushing_policy"
discreteAction = 0
rend = False

env = bioEnv()

goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE
# Wrap the model
model = HER.load("models/TD3/policy_TD3_new.pkl", env=env)

obs = env.reset()

for _ in range(10000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    if done:
        obs = env.reset()
Example #26
import gym
import time

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2, DQN, HER, DDPG

import synergyenvs

env = gym.make("GraspBoxPybullet-v0")
env.render()
o = env.reset()

# model = PPO2(MlpPolicy, env, verbose=1)
model = HER('MlpPolicy', env, DDPG, n_sampled_goal=4, verbose=0)
# HER.load is a classmethod that returns a new model, so calling model.load(...) on an
# instance would not update it in place; reassign the loaded model instead.
model = HER.load("./her_graspbox-1", env=env)

env.camera_adjust()

for _ in range(10000):
    env.render()
    action, _states = model.predict(o)
    # action = env.action_space.sample()
    o, r, done, info = env.step(action)
    print(o, r, done, info)
    if done:
        o = env.reset()
    time.sleep(0.1)

env.close()
Example #27
memory_limit = 1000000
# -r
normalize_returns = True
# -t
timesteps = 100000
policy_name = "pushing_policy"
discreteAction = 0
rend = True

env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(),
                         renders=rend,
                         useIK=0,
                         isDiscrete=discreteAction,
                         action_space=action_space,
                         fixedPositionObj=fixed,
                         includeVelObs=True,
                         object_position=0,
                         test_phase=True)

goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
# Wrap the model
model = HER.load("../policies/pushing_DDPG_HER_PHASE_1best_model.pkl", env=env)

obs = env.reset()

for _ in range(10000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    if done:
        obs = env.reset()
Example #28
# NOTE: it works even without action noise
# n_actions = env.action_space.shape[0]
# noise_std = 0.2
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))
# model = HER('MlpPolicy', env, DDPG, n_sampled_goal=n_sampled_goal,
#             goal_selection_strategy='future',
#             verbose=1, buffer_size=int(1e6),
#             actor_lr=1e-3, critic_lr=1e-3, action_noise=action_noise,
#             gamma=0.95, batch_size=256,
#             policy_kwargs=dict(layers=[256, 256, 256]))

model.learn(int(2e5))
model.save('her_sac_highway')

# Load saved model
model = HER.load('her_sac_highway', env=env)

obs = env.reset()

# Evaluate the agent
episode_reward = 0
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, info = env.step(action)
    env.render()
    episode_reward += reward
    if done or info.get('is_success', False):
        print("Reward:", episode_reward, "Success?",
              info.get('is_success', False))
        episode_reward = 0.0
        obs = env.reset()
normalize_observations = False
# -g
gamma = 0.9
# -b
batch_size = 16
# -m
memory_limit = 1000000
# -r
normalize_returns = True
# -t
timesteps = 1000000
policy_name = "pushing_policy"
discreteAction = 0
rend = True

env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(), renders=rend, useIK=0,
                             isDiscrete=discreteAction, action_space=action_space,
                             fixedPositionObj=fixed, includeVelObs=True, object_position=0,
                             test_phase=True, alg='td3_normal_policy_to_different_physics',
                             type_physics=2, max_episode_steps=500)

goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE
# Wrap the model
model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl", env=env)

obs = env.reset()

for _ in range(10000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    if done:
        obs = env.reset()
# -r
normalize_returns = True
# -t
timesteps = 1000000
policy_name = "pushing_policy"
discreteAction = 0
rend = True

env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(),
                         renders=rend,
                         useIK=0,
                         isDiscrete=discreteAction,
                         action_space=action_space,
                         fixedPositionObj=fixed,
                         includeVelObs=True,
                         object_position=1)

goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
# Wrap the model
model = HER.load(
    "../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1best_model.pkl",
    env=env)

obs = env.reset()

for _ in range(10000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    if done:
        obs = env.reset()