Example #1
    def step(self, act: np.ndarray) -> tuple:
        # Clip the action according to the maximum step length
        action = np.clip(act, -1, 1) * self.svpg_max_step_length

        # Perform step by moving into direction of action
        self.svpg_state = np.clip(self.svpg_state + action, 0, 1)
        # Map the normalized state from [0, 1] to [0.5, 1.5] and scale the nominal domain parameters with it
        param_norm = self.svpg_state + 0.5
        rand_eval_params = [self.array_to_dict(param_norm * self.nominal())] * self.num_trajs
        norm_eval_params = [self.nominal_dict()] * self.num_trajs
        rand = eval_domain_params(self.pool, self.wrapped_env,
                                  self.inner_policy, rand_eval_params)
        ref = eval_domain_params(self.pool, self.wrapped_env,
                                 self.inner_policy, norm_eval_params)
        # Let the discriminator rate every randomized trajectory, then average
        rewards = [self.discriminator.get_reward(traj) for traj in rand]
        reward = np.mean(rewards)
        info = dict(rand=rand, ref=ref)
        done = self.count >= self.max_steps - 1
        self.count += 1
        self.horizon_count += 1
        # Re-sample a random normalized state once the horizon is reached
        if self.horizon_count >= self.horizon:
            self.horizon_count = 0
            self.svpg_state = np.random.random_sample(len(self.parameters))

        return self.svpg_state, reward, done, info
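
The core of this step function is the affine map from the normalized SVPG state in [0, 1] to physical domain parameters via (state + 0.5) * nominal, i.e. the parameters vary within roughly +/- 50% of their nominal values. A minimal, self-contained sketch of the update and mapping (NumPy only; the step length, dimensions, and nominal values are made up for illustration):

import numpy as np

svpg_max_step_length = 0.05     # assumed maximum step length
nominal = np.array([1.0, 0.5])  # hypothetical nominal domain parameters

svpg_state = np.random.random_sample(2)  # normalized state in [0, 1]
act = np.array([0.4, -0.7])              # raw action from an SVPG particle
action = np.clip(act, -1, 1) * svpg_max_step_length
svpg_state = np.clip(svpg_state + action, 0, 1)

physical_params = (svpg_state + 0.5) * nominal  # shift to [0.5, 1.5], then scale the nominals
print(physical_params)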
Example #2
    def eval_states(self, states: Sequence[np.ndarray]):
        """
        Evaluate the states.

        :param states: the states to evaluate
        :return: respective rewards and according trajectories
        """
        # Map every state to a physical domain parameter dict and repeat it num_trajs times; the
        # repetitions of one state stay adjacent so the rewards can be averaged chunk-wise below
        sstates = [
            self.parameters.array_to_dict((state + 0.5)*self.parameters.nominal)
            for state in states
            for _ in range(self.num_trajs)
        ]
        rand = eval_domain_params(self.pool, self.wrapped_env, self.inner_policy, sstates)
        ref = eval_domain_params(self.pool, self.wrapped_env, self.inner_policy,
                                 [self.parameters.nominal_dict]*(self.num_trajs*len(states)))
        # Rate every randomized trajectory, then average over the trajectories belonging to the same state
        rewards = [self.discriminator.get_reward(traj) for traj in rand]
        rewards = [np.mean(rewards[i*self.num_trajs:(i + 1)*self.num_trajs]) for i in range(len(states))]
        return rewards, rand, ref
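
The chunk-wise averaging only works because sstates keeps the num_trajs repetitions of every state adjacent: trajectories i*num_trajs through (i + 1)*num_trajs - 1 all belong to state i. A small sketch of that bookkeeping with dummy discriminator rewards (all numbers made up):

import numpy as np

num_trajs = 3
num_states = 2
traj_rewards = [0.1, 0.2, 0.3, 0.9, 1.0, 1.1]  # ordered: first all trajectories of state 0, then state 1

per_state = [np.mean(traj_rewards[i*num_trajs:(i + 1)*num_trajs]) for i in range(num_states)]
print(per_state)  # [0.2, 1.0] up to float rounding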
Example #3
    for i, policy in enumerate(policies):
        # Create a new sampler pool for every policy to synchronize the random seeds, i.e. the initial states
        pool = SamplerPool(args.num_envs)

        # Seed the sampler
        if args.seed is not None:
            pool.set_seed(args.seed)
            print_cbt(f'Set seed to {args.seed}', 'y')
        else:
            print_cbt('No seed was set', 'r', bright=True)

        # Add an action normalization wrapper if the policy was trained with one
        env = conditional_actnorm_wrapper(env, ex_dirs, i)

        # Sample rollouts
        ros = eval_domain_params(pool, env, policy, param_list, init_state)

        # Compute result metrics
        rets = [ro.undiscounted_return() for ro in ros]
        lengths = [float(ro.length) for ro in ros]  # int values are not numeric in pandas
        varied_param_values = [ro.rollout_info['domain_param'][varied_param_key] for ro in ros]
        varied_param = {varied_param_key: varied_param_values}
        df = pd.concat([df, pd.DataFrame(dict(policy=exp_labels[i], ret=rets, len=lengths, **varied_param))],
                       ignore_index=True)

    metrics = dict(
        avg_len=df.groupby('policy').mean()['len'].to_dict(),
        avg_ret=df.groupby('policy').mean()['ret'].to_dict(),
        median_ret=df.groupby('policy').median()['ret'].to_dict(),
        min_ret=df.groupby('policy').min()['ret'].to_dict(),
        max_ret=df.groupby('policy').max()['ret'].to_dict(),
    )
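
The groupby calls above collapse the per-rollout data frame into one scalar per policy and metric. A toy example of the same aggregation (policy names and numbers are illustrative):

import pandas as pd

df = pd.DataFrame(dict(
    policy=['a', 'a', 'b', 'b'],
    ret=[100.0, 120.0, 90.0, 95.0],
    len=[500.0, 500.0, 480.0, 490.0],
))
print(df.groupby('policy').mean()['ret'].to_dict())    # {'a': 110.0, 'b': 92.5}
print(df.groupby('policy').median()['ret'].to_dict())  # {'a': 110.0, 'b': 92.5}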
Example #4
def evaluate_policy(args, ex_dir):
    """Helper function to evaluate the policy from an experiment in the associated environment."""
    env, policy, _ = load_experiment(ex_dir, args)

    # Create multi-dim evaluation grid
    param_spec = dict()
    param_spec_dim = None

    if isinstance(inner_env(env), BallOnPlateSim):
        param_spec["ball_radius"] = np.linspace(0.02, 0.08, num=2, endpoint=True)
        param_spec["ball_rolling_friction_coefficient"] = np.linspace(0.0295, 0.9, num=2, endpoint=True)

    elif isinstance(inner_env(env), QQubeSwingUpSim):
        eval_num = 200
        # Use nominal values for all other parameters.
        for param, nominal_value in env.get_nominal_domain_param().items():
            param_spec[param] = nominal_value
        # param_spec["gravity_const"] = np.linspace(5.0, 15.0, num=eval_num, endpoint=True)
        param_spec["damping_pend_pole"] = np.linspace(0.0, 0.0001, num=eval_num, endpoint=True)
        param_spec["damping_rot_pole"] = np.linspace(0.0, 0.0006, num=eval_num, endpoint=True)
        param_spec_dim = 2

    elif isinstance(inner_env(env), QBallBalancerSim):
        # param_spec["gravity_const"] = np.linspace(7.91, 11.91, num=11, endpoint=True)
        # param_spec["ball_mass"] = np.linspace(0.003, 0.3, num=11, endpoint=True)
        # param_spec["ball_radius"] = np.linspace(0.01, 0.1, num=11, endpoint=True)
        param_spec["plate_length"] = np.linspace(0.275, 0.275, num=11, endpoint=True)
        param_spec["arm_radius"] = np.linspace(0.0254, 0.0254, num=11, endpoint=True)
        # param_spec["load_inertia"] = np.linspace(5.2822e-5*0.5, 5.2822e-5*1.5, num=11, endpoint=True)
        # param_spec["motor_inertia"] = np.linspace(4.6063e-7*0.5, 4.6063e-7*1.5, num=11, endpoint=True)
        # param_spec["gear_ratio"] = np.linspace(60, 80, num=11, endpoint=True)
        # param_spec["gear_efficiency"] = np.linspace(0.6, 1.0, num=11, endpoint=True)
        # param_spec["motor_efficiency"] = np.linspace(0.49, 0.89, num=11, endpoint=True)
        # param_spec["motor_back_emf"] = np.linspace(0.006, 0.066, num=11, endpoint=True)
        # param_spec["motor_resistance"] = np.linspace(2.6*0.5, 2.6*1.5, num=11, endpoint=True)
        # param_spec["combined_damping"] = np.linspace(0.0, 0.05, num=11, endpoint=True)
        # param_spec["friction_coeff"] = np.linspace(0, 0.015, num=11, endpoint=True)
        # param_spec["voltage_thold_x_pos"] = np.linspace(0.0, 1.0, num=11, endpoint=True)
        # param_spec["voltage_thold_x_neg"] = np.linspace(-1., 0.0, num=11, endpoint=True)
        # param_spec["voltage_thold_y_pos"] = np.linspace(0.0, 1.0, num=11, endpoint=True)
        # param_spec["voltage_thold_y_neg"] = np.linspace(-1.0, 0, num=11, endpoint=True)
        # param_spec["offset_th_x"] = np.linspace(-5/180*np.pi, 5/180*np.pi, num=11, endpoint=True)
        # param_spec["offset_th_y"] = np.linspace(-5/180*np.pi, 5/180*np.pi, num=11, endpoint=True)

    else:
        raise NotImplementedError

    # Always add an action delay wrapper (with 0 delay by default)
    if typed_env(env, ActDelayWrapper) is None:
        env = ActDelayWrapper(env)
    # param_spec['act_delay'] = np.linspace(0, 30, num=11, endpoint=True, dtype=int)

    add_info = "-".join(param_spec.keys())

    # Create multidimensional results grid and ensure right number of rollouts
    param_list = param_grid(param_spec)
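    # Repeating the list yields num_rollouts_per_config rollouts for every point of the parameter grid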
    param_list *= args.num_rollouts_per_config

    # Fix initial state (set to None if it should not be fixed)
    init_state = np.array([0.0, 0.0, 0.0, 0.0])

    # Create sampler
    pool = SamplerPool(args.num_workers)
    if args.seed is not None:
        pool.set_seed(args.seed)
        print_cbt(f"Set the random number generators' seed to {args.seed}.", "w")
    else:
        print_cbt("No seed was set", "y")

    # Sample rollouts
    ros = eval_domain_params(pool, env, policy, param_list, init_state)

    # Compute metrics
    lod = []
    for ro in ros:
        d = dict(**ro.rollout_info["domain_param"], ret=ro.undiscounted_return(), len=ro.length)
        # Remove the observation noise entries from the domain parameters (if present)
        d.pop("obs_noise_mean", None)
        d.pop("obs_noise_std", None)
        lod.append(d)

    df = pd.DataFrame(lod)
    metrics = dict(
        avg_len=df["len"].mean(),
        avg_ret=df["ret"].mean(),
        median_ret=df["ret"].median(),
        min_ret=df["ret"].min(),
        max_ret=df["ret"].max(),
        std_ret=df["ret"].std(),
    )
    pprint(metrics, indent=4)

    # Create subfolder and save
    timestamp = datetime.datetime.now()
    add_info = timestamp.strftime(pyrado.timestamp_format) + "--" + add_info
    save_dir = osp.join(ex_dir, "eval_domain_grid", add_info)
    os.makedirs(save_dir, exist_ok=True)

    save_dicts_to_yaml(
        {"ex_dir": str(ex_dir)},
        {"varied_params": list(param_spec.keys())},
        {"num_rpp": args.num_rollouts_per_config, "seed": args.seed},
        {"metrics": dict_arraylike_to_float(metrics)},
        save_dir=save_dir,
        file_name="summary",
    )
    pyrado.save(df, f"df_sp_grid_{len(param_spec) if param_spec_dim is None else param_spec_dim}d.pkl", save_dir)
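
For reference, param_grid is used above to expand the dict of value arrays into a flat list of per-rollout parameter dicts, i.e. the cartesian product over all varied parameters. The stand-in below mimics that usage; it is a guess at the semantics, not pyrado's actual implementation:

import itertools

import numpy as np


def param_grid_sketch(param_spec: dict) -> list:
    """Expand {name: values} into a list of {name: value} dicts via the cartesian product."""
    spec = {k: np.atleast_1d(v) for k, v in param_spec.items()}  # scalars pass through as 1-element arrays
    keys = list(spec.keys())
    return [dict(zip(keys, combo)) for combo in itertools.product(*(spec[k] for k in keys))]


grid = param_grid_sketch({
    "damping_pend_pole": np.linspace(0.0, 1e-4, num=3),
    "damping_rot_pole": np.linspace(0.0, 6e-4, num=2),
})
print(len(grid))  # 6 parameter combinations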