Example #1
    def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
        if "num_population" in args.__dict__:
            args.num_cpu = args.num_population * 2

        assert not (registered_env[args.env][3] is ThreadingType.NONE and args.num_cpu != 1), \
            "Error: cannot have more than 1 CPU for the environment {}".format(args.env)
        if env_kwargs is not None and env_kwargs.get("use_srl", False):
            srl_model = MultiprocessSRLModel(args.num_cpu, args.env,
                                             env_kwargs)
            env_kwargs["state_dim"] = srl_model.state_dim
            env_kwargs["srl_pipe"] = srl_model.pipe

        envs = [
            makeEnv(args.env,
                    args.seed,
                    i,
                    args.log_dir,
                    allow_early_resets=True,
                    env_kwargs=env_kwargs) for i in range(args.num_cpu)
        ]
        envs = SubprocVecEnv(envs)
        envs = VecFrameStack(envs, args.num_stack)
        if args.srl_model != "raw_pixels" and args.algo_type == "v2":
            envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
            envs = loadRunningAverage(envs,
                                      load_path_normalise=load_path_normalise)
        return envs
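For reference, the same wrapping chain (SubprocVecEnv → VecFrameStack → VecNormalize) can be reproduced with plain stable-baselines wrappers on a standard Gym environment. The sketch below is only an illustration of that pattern, not code from the project; the environment id and hyperparameters are placeholders.

import gym
from stable_baselines.common.vec_env import SubprocVecEnv, VecFrameStack, VecNormalize

def make_single_env(env_id, seed, rank):
    # Each worker process builds its own environment with a distinct seed
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env
    return _init

if __name__ == "__main__":
    num_cpu, num_stack = 4, 4
    envs = SubprocVecEnv([make_single_env("CartPole-v1", seed=0, rank=i) for i in range(num_cpu)])
    envs = VecFrameStack(envs, num_stack)                          # stack the last num_stack observations
    envs = VecNormalize(envs, norm_obs=True, norm_reward=False)    # normalize observations, keep raw rewards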
Example #2
File: ddpg.py Project: s206283/gcrl
    def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
        # Even though DDPG is single-core only, we need to use the pipe system for the SRL model to work
        if env_kwargs is not None and env_kwargs.get("use_srl", False):
            srl_model = MultiprocessSRLModel(1, args.env, env_kwargs)
            env_kwargs["state_dim"] = srl_model.state_dim
            env_kwargs["srl_pipe"] = srl_model.pipe

        env = DummyVecEnv([makeEnv(args.env, args.seed, 0, args.log_dir, env_kwargs=env_kwargs)])

        if args.srl_model != "raw_pixels":
            env = VecNormalize(env, norm_reward=False)
            env = loadRunningAverage(env, load_path_normalise=load_path_normalise)

        return env
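A hypothetical call site for this classmethod could look as follows. DDPGModel, the environment id, and the argument values are assumptions for illustration only; in the project these come from the command-line parser of the training script.

from argparse import Namespace

# Attribute names mirror the ones read inside makeEnv above.
args = Namespace(env="OmnirobotEnv-v0", seed=0, log_dir="logs/", srl_model="ground_truth")
env_kwargs = {"use_srl": False}

# DDPGModel is a placeholder for the algorithm class that defines this makeEnv classmethod.
env = DDPGModel.makeEnv(args, env_kwargs=env_kwargs,
                        load_path_normalise="logs/previous_run/")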
Example #3
    def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
        # Even though SAC is single-core only, we need to use the pipe system for the SRL model to work
        if env_kwargs is not None and env_kwargs.get("use_srl", False):
            srl_model = MultiprocessSRLModel(1, args.env, env_kwargs)
            env_kwargs["state_dim"] = srl_model.state_dim
            env_kwargs["srl_pipe"] = srl_model.pipe

        env = CustomDummyVecEnv([makeEnv(args.env, args.seed, 0, args.log_dir, env_kwargs=env_kwargs)])

        if args.srl_model != "raw_pixels":
            env = VecNormalize(env, norm_obs=True, norm_reward=False)
            env = loadRunningAverage(env, load_path_normalise=load_path_normalise)

        # Normalize only raw pixels
        # WARNING: when using framestacking, the memory used by the replay buffer can grow quickly
        return WrapFrameStack(env, args.num_stack, normalize=args.srl_model == "raw_pixels")
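To make the replay-buffer warning above concrete, here is a rough back-of-the-envelope sketch; the frame shape and buffer length are illustrative assumptions, not values taken from the project.

height, width, channels = 224, 224, 3      # assumed raw RGB frame shape
num_stack = 4                               # frames stacked per observation
buffer_size = 100_000                       # assumed replay buffer length

bytes_per_obs = height * width * channels * num_stack               # uint8 pixels per stacked observation
total_gb = bytes_per_obs * buffer_size * 2 / 1e9                    # obs and next_obs stored per transition
print("approx. replay buffer memory: {:.1f} GB".format(total_gb))   # ~120 GB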
Example #4
    def makeEnv(cls, args, env_kwargs=None, load_path_normalise=None):
        # Even though DeepQ is single-core only, we need to use the pipe system for the SRL model to work
        if env_kwargs is not None and env_kwargs.get("use_srl", False):
            srl_model = MultiprocessSRLModel(1, args.env, env_kwargs)
            env_kwargs["state_dim"] = srl_model.state_dim
            env_kwargs["srl_pipe"] = srl_model.pipe

        envs = DummyVecEnv([makeEnv(args.env, args.seed, 0, args.log_dir, env_kwargs=env_kwargs)])
        envs = VecFrameStack(envs, args.num_stack)

        if args.srl_model != "raw_pixels":
            printYellow("Using MLP policy because working on state representation")
            envs = VecNormalize(envs, norm_obs=True, norm_reward=False)
            envs = loadRunningAverage(envs, load_path_normalise=load_path_normalise)

        return envs
Example #5
def env_thread(args, thread_num, partition=True):
    """
    Run a session of an environment
    :param args: (argparse.Namespace) parsed command-line arguments
    :param thread_num: (int) the thread ID of the environment session
    :param partition: (bool) whether the output should be split into multiple parts (default=True)
    """
    env_kwargs = {
        "max_distance": args.max_distance,
        "random_target": args.random_target,
        "force_down": True,
        "is_discrete": not args.continuous_actions,
        "renders": thread_num == 0 and args.display,
        "record_data": not args.no_record_data,
        "multi_view": args.multi_view,
        "save_path": args.save_path,
        "shape_reward": args.shape_reward,
        "simple_continual_target": args.simple_continual,
        "circular_continual_move": args.circular_continual,
        "square_continual_move": args.square_continual,
        "short_episodes": args.short_episodes
    }

    if partition:
        env_kwargs["name"] = args.name + "_part-" + str(thread_num)
    else:
        env_kwargs["name"] = args.name

    load_path, train_args, algo_name, algo_class = None, None, None, None
    model = None
    srl_model = None
    srl_state_dim = 0
    generated_obs = None
    env_norm = None

    if args.run_policy in ["walker", "custom"]:
        if args.latest:
            args.log_dir = latestPath(args.log_custom_policy)
        else:
            args.log_dir = args.log_custom_policy
        args.render = args.display
        args.plotting, args.action_proba = False, False

        train_args, load_path, algo_name, algo_class, _, env_kwargs_extra = loadConfigAndSetup(
            args)
        env_kwargs["srl_model"] = env_kwargs_extra["srl_model"]
        env_kwargs["random_target"] = env_kwargs_extra.get(
            "random_target", False)
        env_kwargs["use_srl"] = env_kwargs_extra.get("use_srl", False)

        # TODO REFACTOR
        env_kwargs["simple_continual_target"] = env_kwargs_extra.get(
            "simple_continual_target", False)
        env_kwargs["circular_continual_move"] = env_kwargs_extra.get(
            "circular_continual_move", False)
        env_kwargs["square_continual_move"] = env_kwargs_extra.get(
            "square_continual_move", False)
        env_kwargs["eight_continual_move"] = env_kwargs_extra.get(
            "eight_continual_move", False)

        eps = 0.2
        env_kwargs["state_init_override"] = np.array([MIN_X + eps, MAX_X - eps]) \
            if args.run_policy == 'walker' else None
        if env_kwargs["use_srl"]:
            env_kwargs["srl_model_path"] = env_kwargs_extra.get(
                "srl_model_path", None)
            env_kwargs["state_dim"] = getSRLDim(
                env_kwargs_extra.get("srl_model_path", None))
            srl_model = MultiprocessSRLModel(num_cpu=args.num_cpu,
                                             env_id=args.env,
                                             env_kwargs=env_kwargs)
            env_kwargs["srl_pipe"] = srl_model.pipe

    env_class = registered_env[args.env][0]
    env = env_class(**env_kwargs)

    if env_kwargs.get('srl_model', None) not in ["raw_pixels", None]:
        # TODO: Remove env duplication
        # This is a dirty trick to normalize the observations:
        # since we override the SRL environment functions (step, reset) for on-policy generation & generative replay,
        # using stable-baselines' normalisation wrappers (step & reset) would break.
        env_norm = [
            makeEnv(args.env,
                    args.seed,
                    i,
                    args.log_dir,
                    allow_early_resets=False,
                    env_kwargs=env_kwargs) for i in range(args.num_cpu)
        ]
        env_norm = DummyVecEnv(env_norm)
        env_norm = VecNormalize(env_norm, norm_obs=True, norm_reward=False)
        env_norm = loadRunningAverage(
            env_norm, load_path_normalise=args.log_custom_policy)
    using_real_omnibot = args.env == "OmnirobotEnv-v0" and USING_OMNIROBOT

    walker_path = None
    action_walker = None
    state_init_for_walker = None
    kwargs_reset, kwargs_step = {}, {}

    if args.run_policy in ['custom', 'ppo2', 'walker']:
        # Additional env when using a trained agent to generate data
        train_env = vecEnv(env_kwargs, env_class)

        if args.run_policy == 'ppo2':
            model = PPO2(CnnPolicy, train_env).learn(args.ppo2_timesteps)
        else:
            _, _, algo_args = createEnv(args, train_args, algo_name,
                                        algo_class, env_kwargs)
            tf.reset_default_graph()
            set_global_seeds(args.seed % 2 ** 32)  # modulo, not XOR: keep the seed within the 32-bit range
            printYellow("Compiling Policy function....")
            model = algo_class.load(load_path, args=algo_args)
            if args.run_policy == 'walker':
                walker_path = walkerPath()

    if len(args.replay_generative_model) > 0:
        srl_model = loadSRLModel(args.log_generative_model,
                                 th.cuda.is_available())
        srl_state_dim = srl_model.state_dim
        srl_model = srl_model.model.model

    frames = 0
    start_time = time.time()

    # divide the episodes evenly across threads, then give one extra episode to the
    # first (num_episode % num_cpu) threads so the total adds up to num_episode
    for i_episode in range(args.num_episode // args.num_cpu
                           + 1 * (args.num_episode % args.num_cpu > thread_num)):

        # seed + position in this slice + size of slice (with remainder if partitions are uneven)
        seed = args.seed + i_episode + args.num_episode // args.num_cpu * thread_num + \
               (thread_num if thread_num <= args.num_episode % args.num_cpu else args.num_episode % args.num_cpu)
        seed = seed % 2 ** 32  # modulo, not XOR: keep the seed within the 32-bit range
        if args.run_policy not in ['custom', 'walker']:
            env.seed(seed)
            # also seed the action space, used by the sample() function from gym.spaces
            env.action_space.seed(seed)

        if len(args.replay_generative_model) > 0:

            sample = Variable(th.randn(1, srl_state_dim))
            if th.cuda.is_available():
                sample = sample.cuda()

            generated_obs = srl_model.decode(sample)
            generated_obs = generated_obs[0].detach().cpu().numpy()
            generated_obs = deNormalize(generated_obs)

            kwargs_reset['generated_observation'] = generated_obs
        obs = env.reset(**kwargs_reset)
        done = False
        action_proba = None
        t = 0
        episode_toward_target_on = False

        while not done:

            env.render()

            # Policy to run on the fly - to be trained before generation
            if args.run_policy == 'ppo2':
                action, _ = model.predict([obs])

            # Custom pre-trained Policy (SRL or End-to-End)
            elif args.run_policy in ['custom', 'walker']:
                obs = env_norm._normalize_observation(obs)
                action = [model.getAction(obs, done)]
                action_proba = model.getActionProba(obs, done)
                if args.run_policy == 'walker':
                    action_walker = np.array(walker_path[t])
            # Random Policy
            else:
                # Using a target reaching policy (untrained, from camera) when collecting data from real OmniRobot
                if episode_toward_target_on and np.random.rand() < args.toward_target_timesteps_proportion and \
                        using_real_omnibot:
                    action = [env.actionPolicyTowardTarget()]
                else:
                    action = [env.action_space.sample()]

            # Generative replay +/- for on-policy action
            if len(args.replay_generative_model) > 0:

                if args.run_policy == 'custom':
                    obs = obs.reshape(1, srl_state_dim)
                    obs = th.from_numpy(obs.astype(np.float32)).cuda()
                    z = obs
                    generated_obs = srl_model.decode(z)
                else:
                    sample = Variable(th.randn(1, srl_state_dim))

                    if th.cuda.is_available():
                        sample = sample.cuda()

                    generated_obs = srl_model.decode(sample)
                generated_obs = generated_obs[0].detach().cpu().numpy()
                generated_obs = deNormalize(generated_obs)

            action_to_step = action[0]
            kwargs_step = {k: v for (k, v) in [("generated_observation", generated_obs),
                                               ("action_proba", action_proba),
                                               ("action_grid_walker", action_walker)]
                           if v is not None}

            obs, _, done, _ = env.step(action_to_step, **kwargs_step)

            frames += 1
            t += 1
            if done:
                episode_toward_target_on = (np.random.rand() < args.toward_target_timesteps_proportion
                                            and using_real_omnibot)
                print("Episode finished after {} timesteps".format(t + 1))

        if thread_num == 0:
            print("{:.2f} FPS".format(frames * args.num_cpu /
                                      (time.time() - start_time)))
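As a sanity check on the episode partitioning used in the loop above, a small standalone sketch (episode and thread counts are arbitrary) reproduces the same formula and confirms that the per-thread counts sum to num_episode.

num_episode, num_cpu = 10, 4

counts = []
for thread_num in range(num_cpu):
    # same formula as the loop bound in env_thread: an even share, plus one extra
    # episode for the first (num_episode % num_cpu) threads
    counts.append(num_episode // num_cpu + 1 * (num_episode % num_cpu > thread_num))

print(counts, sum(counts))  # [3, 3, 2, 2] 10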