Python NNPolicyLearner示例，obp.policy.offline.NNPolicyLearner Python示例

示例#1

0

显示文件

文件： test_offline.py 项目： zwcdp/zr-obp

def test_nn_policy_learner_fit():
    """Check that ``NNPolicyLearner.fit`` raises ``ValueError`` on bad inputs.

    Two cases are exercised:

    * ``context`` has a different number of rows (101) than ``action``,
      ``reward`` and ``pscore`` (100 each);
    * the learner is built with ``dim_context=3`` but is fitted on a
      context array with two columns.

    Learner construction and fixture setup are kept outside the
    ``pytest.raises`` blocks so that only an error raised by ``fit`` itself
    can satisfy the expectation.
    """
    context = np.ones((100, 2), dtype=np.float32)
    action = np.zeros((100, ), dtype=int)
    reward = np.ones((100, ), dtype=np.float32)
    pscore = np.array([0.5] * 100, dtype=np.float32)
    ipw = InverseProbabilityWeighting()

    # inconsistency with the shape
    learner = NNPolicyLearner(
        n_actions=2,
        dim_context=2,
        off_policy_objective=ipw.estimate_policy_value_tensor,
    )
    variant_context = np.ones((101, 2), dtype=np.float32)
    with pytest.raises(ValueError):
        learner.fit(context=variant_context,
                    action=action,
                    reward=reward,
                    pscore=pscore)

    # inconsistency between dim_context and context
    learner = NNPolicyLearner(
        n_actions=2,
        dim_context=3,
        off_policy_objective=ipw.estimate_policy_value_tensor,
    )
    with pytest.raises(ValueError):
        learner.fit(context=context,
                    action=action,
                    reward=reward,
                    pscore=pscore)

示例#2

0

显示文件

文件： test_offline.py 项目： aiueola/zr-obp

def test_nn_policy_learner_fit():
    """Check that ``NNPolicyLearner.fit`` raises the expected ``ValueError``.

    Two cases are exercised:

    * ``context`` has a different number of rows (101) than ``action``,
      ``reward`` and ``pscore`` (100 each);
    * the learner is built with ``dim_context=3`` but is fitted on a
      context array with two columns.

    The expected message prefixes contain ``.`` and ``[...]``, which are
    regular-expression metacharacters. ``pytest.raises(match=...)`` applies
    ``re.search``, so the prefixes are passed through ``re.escape``;
    unescaped, ``[0]`` and ``[1]`` would be character classes and the two
    patterns could not tell the two errors apart.
    """
    # Local import keeps this block self-contained.
    import re

    context = np.ones((100, 2), dtype=np.float32)
    action = np.zeros((100, ), dtype=int)
    reward = np.ones((100, ), dtype=np.float32)
    pscore = np.array([0.5] * 100, dtype=np.float32)

    # inconsistency with the shape
    desc = "Expected `context.shape[0]"
    learner = NNPolicyLearner(n_actions=2,
                              dim_context=2,
                              off_policy_objective="ipw")
    variant_context = np.ones((101, 2), dtype=np.float32)
    # Only `fit` sits inside `raises`, so setup errors cannot pass the test.
    with pytest.raises(ValueError, match=re.escape(desc)):
        learner.fit(context=variant_context,
                    action=action,
                    reward=reward,
                    pscore=pscore)

    # inconsistency between dim_context and context
    desc = "Expected `context.shape[1]"
    learner = NNPolicyLearner(n_actions=2,
                              dim_context=3,
                              off_policy_objective="ipw")
    with pytest.raises(ValueError, match=re.escape(desc)):
        learner.fit(context=context,
                    action=action,
                    reward=reward,
                    pscore=pscore)

示例#3

0

显示文件

文件： test_offline.py 项目： aiueola/zr-obp

def test_nn_policy_learner_init_using_invalid_inputs(
    n_actions,
    len_list,
    dim_context,
    off_policy_objective,
    lambda_,
    policy_reg_param,
    var_reg_param,
    hidden_layer_size,
    activation,
    solver,
    alpha,
    batch_size,
    learning_rate_init,
    max_iter,
    shuffle,
    random_state,
    tol,
    momentum,
    nesterovs_momentum,
    early_stopping,
    validation_fraction,
    beta_1,
    beta_2,
    epsilon,
    n_iter_no_change,
    err,
    description,
):
    """Expect ``NNPolicyLearner(...)`` to raise ``err`` for the given inputs.

    Every argument except ``err`` and ``description`` is forwarded unchanged
    as a keyword argument to the ``NNPolicyLearner`` constructor.

    ``err`` is the exception type the constructor is expected to raise and
    ``description`` is the expected message fragment.

    NOTE(review): ``description`` is interpolated into the ``match`` pattern,
    which pytest evaluates with ``re.search``; regex metacharacters in it
    are therefore not matched literally, and the trailing ``*`` makes its
    last character optional. Confirm against the parametrized cases.
    """
    constructor_kwargs = {
        "n_actions": n_actions,
        "len_list": len_list,
        "dim_context": dim_context,
        "off_policy_objective": off_policy_objective,
        "lambda_": lambda_,
        "policy_reg_param": policy_reg_param,
        "var_reg_param": var_reg_param,
        "hidden_layer_size": hidden_layer_size,
        "activation": activation,
        "solver": solver,
        "alpha": alpha,
        "batch_size": batch_size,
        "learning_rate_init": learning_rate_init,
        "max_iter": max_iter,
        "shuffle": shuffle,
        "random_state": random_state,
        "tol": tol,
        "momentum": momentum,
        "nesterovs_momentum": nesterovs_momentum,
        "early_stopping": early_stopping,
        "validation_fraction": validation_fraction,
        "beta_1": beta_1,
        "beta_2": beta_2,
        "epsilon": epsilon,
        "n_iter_no_change": n_iter_no_change,
    }
    with pytest.raises(err, match=f"{description}*"):
        NNPolicyLearner(**constructor_kwargs)

示例#4

0

显示文件

文件： test_offline.py 项目： zwcdp/zr-obp

def test_nn_policy_learner_init_using_valid_inputs(
    n_actions,
    len_list,
    dim_context,
    off_policy_objective,
    hidden_layer_size,
    activation,
    solver,
    alpha,
    batch_size,
    learning_rate_init,
    max_iter,
    shuffle,
    random_state,
    tol,
    momentum,
    nesterovs_momentum,
    early_stopping,
    validation_fraction,
    beta_1,
    beta_2,
    epsilon,
    n_iter_no_change,
    max_fun,
    description,
):
    """Expect ``NNPolicyLearner(...)`` to construct without error.

    Every argument except ``description`` is forwarded unchanged as a
    keyword argument to the ``NNPolicyLearner`` constructor, and the result
    is checked to be an ``NNPolicyLearner`` instance.

    ``description`` is not used in the body; presumably it only labels the
    parametrized case (the parametrization is not visible here).
    """
    constructor_kwargs = {
        "n_actions": n_actions,
        "len_list": len_list,
        "dim_context": dim_context,
        "off_policy_objective": off_policy_objective,
        "hidden_layer_size": hidden_layer_size,
        "activation": activation,
        "solver": solver,
        "alpha": alpha,
        "batch_size": batch_size,
        "learning_rate_init": learning_rate_init,
        "max_iter": max_iter,
        "shuffle": shuffle,
        "random_state": random_state,
        "tol": tol,
        "momentum": momentum,
        "nesterovs_momentum": nesterovs_momentum,
        "early_stopping": early_stopping,
        "validation_fraction": validation_fraction,
        "beta_1": beta_1,
        "beta_2": beta_2,
        "epsilon": epsilon,
        "n_iter_no_change": n_iter_no_change,
        "max_fun": max_fun,
    }
    built = NNPolicyLearner(**constructor_kwargs)
    assert isinstance(built, NNPolicyLearner)

示例#5

0

显示文件

文件： test_offline.py 项目： zwcdp/zr-obp

def test_nn_policy_learner_create_train_data_for_opl():
    """Check the loaders returned by ``_create_train_data_for_opl``.

    With a learner built with default settings the test expects a training
    ``DataLoader`` and ``None`` for the validation loader; with
    ``early_stopping=True`` it expects both to be ``DataLoader`` instances.
    """
    loader_type = torch.utils.data.DataLoader
    objective_owner = InverseProbabilityWeighting()

    # The same synthetic logged data is fed to both learners.
    logged_data = {
        "context": np.ones((100, 2), dtype=np.int32),
        "action": np.zeros((100, ), dtype=np.int32),
        "reward": np.ones((100, ), dtype=np.float32),
        "pscore": np.full((100, ), 0.5, dtype=np.float32),
        "estimated_rewards_by_reg_model": np.ones((100, 2), dtype=np.float32),
        "position": np.zeros((100, ), dtype=np.int32),
    }

    # Learner built without the early_stopping argument.
    default_learner = NNPolicyLearner(
        n_actions=2,
        dim_context=2,
        off_policy_objective=objective_owner.estimate_policy_value_tensor,
    )
    loaders = default_learner._create_train_data_for_opl(**logged_data)
    train_part, valid_part = loaders
    assert isinstance(train_part, loader_type)
    assert valid_part is None

    # Learner built with early_stopping=True.
    early_stop_learner = NNPolicyLearner(
        n_actions=2,
        dim_context=2,
        off_policy_objective=objective_owner.estimate_policy_value_tensor,
        early_stopping=True,
    )
    loaders = early_stop_learner._create_train_data_for_opl(**logged_data)
    train_part, valid_part = loaders
    assert isinstance(train_part, loader_type)
    assert isinstance(valid_part, loader_type)

示例#6

0

显示文件

文件： test_offline.py 项目： zwcdp/zr-obp

def test_nn_policy_learner_predict_proba():
    """Check ``NNPolicyLearner.predict_proba`` input validation and output.

    Three cases are exercised on a freshly fitted learner each time:

    * a 1D ``context`` is expected to raise ``ValueError``;
    * a ``context`` with three columns (learner built with
      ``dim_context=2``) is expected to raise ``ValueError``;
    * a valid ``(5, 2)`` context is expected to yield an array whose first
      three dimensions are ``(5, n_actions, len_list)`` and whose sum over
      axis 1 is all ones.

    Construction and fitting are kept outside the ``pytest.raises`` blocks
    so that only an error raised by ``predict_proba`` satisfies them.
    """
    n_actions = 2
    len_list = 1
    context = np.ones((100, 2), dtype=np.float32)
    context_test = np.arange(10, dtype=np.float32).reshape(5, 2)
    action = np.zeros((100, ), dtype=int)
    reward = np.ones((100, ), dtype=np.float32)
    pscore = np.array([0.5] * 100, dtype=np.float32)
    ipw = InverseProbabilityWeighting()

    def fit_new_learner():
        """Build a learner with ``dim_context=2`` and fit it on the data."""
        fitted = NNPolicyLearner(
            n_actions=n_actions,
            len_list=len_list,
            dim_context=2,
            off_policy_objective=ipw.estimate_policy_value_tensor,
        )
        fitted.fit(context=context,
                   action=action,
                   reward=reward,
                   pscore=pscore)
        return fitted

    # shape error
    learner = fit_new_learner()
    invalid_context = np.array([1.0, 1.0], dtype=np.float32)
    with pytest.raises(ValueError):
        learner.predict_proba(context=invalid_context)

    # inconsistency between dim_context and context
    learner = fit_new_learner()
    invalid_context = np.array([[1.0, 1.0, 1.0]], dtype=np.float32)
    with pytest.raises(ValueError):
        learner.predict_proba(context=invalid_context)

    learner = fit_new_learner()
    action_dist = learner.predict_proba(context=context_test)
    # `np.ones_like` on the shape tuple would build a length-2 array from the
    # tuple's values; `np.ones` builds the intended (n_rounds, len_list) array.
    assert np.allclose(action_dist.sum(1),
                       np.ones((context_test.shape[0], len_list)))
    assert action_dist.shape[0] == context_test.shape[0]
    assert action_dist.shape[1] == n_actions
    assert action_dist.shape[2] == len_list

示例#7

0

显示文件

文件： test_offline.py 项目： aiueola/zr-obp

def test_nn_policy_learner_predict_proba():
    """Check ``NNPolicyLearner.predict_proba`` input validation and output.

    Three cases are exercised on a freshly fitted learner each time:

    * a 1D ``context`` is expected to raise ``ValueError`` with a message
      containing "`context` must be 2D array";
    * a ``context`` with three columns (learner built with
      ``dim_context=2``) is expected to raise ``ValueError`` with a message
      containing "Expected `context.shape[1]";
    * a valid ``(5, 2)`` context is expected to yield an array whose first
      three dimensions are ``(5, n_actions, len_list)`` and whose sum over
      axis 1 is all ones.

    The expected messages are passed through ``re.escape`` because
    ``pytest.raises(match=...)`` applies ``re.search`` and the messages
    contain regex metacharacters (``.``, ``[``, ``]``). Construction and
    fitting stay outside the ``pytest.raises`` blocks so that only an error
    raised by ``predict_proba`` satisfies them.
    """
    # Local import keeps this block self-contained.
    import re

    n_actions = 2
    len_list = 1
    context = np.ones((100, 2), dtype=np.float32)
    context_test = np.arange(10, dtype=np.float32).reshape(5, 2)
    action = np.zeros((100, ), dtype=int)
    reward = np.ones((100, ), dtype=np.float32)
    pscore = np.array([0.5] * 100, dtype=np.float32)

    def fit_new_learner():
        """Build a learner with ``dim_context=2`` and fit it on the data."""
        fitted = NNPolicyLearner(
            n_actions=n_actions,
            len_list=len_list,
            dim_context=2,
            off_policy_objective="ipw",
        )
        fitted.fit(context=context,
                   action=action,
                   reward=reward,
                   pscore=pscore)
        return fitted

    # shape error
    desc = "`context` must be 2D array"
    learner = fit_new_learner()
    invalid_context = np.array([1.0, 1.0], dtype=np.float32)
    with pytest.raises(ValueError, match=re.escape(desc)):
        learner.predict_proba(context=invalid_context)

    # inconsistency between dim_context and context
    desc = "Expected `context.shape[1]"
    learner = fit_new_learner()
    invalid_context = np.array([[1.0, 1.0, 1.0]], dtype=np.float32)
    with pytest.raises(ValueError, match=re.escape(desc)):
        learner.predict_proba(context=invalid_context)

    learner = fit_new_learner()
    action_dist = learner.predict_proba(context=context_test)
    # `np.ones_like` on the shape tuple would build a length-2 array from the
    # tuple's values; `np.ones` builds the intended (n_rounds, len_list) array.
    assert np.allclose(action_dist.sum(1),
                       np.ones((context_test.shape[0], len_list)))
    assert action_dist.shape[0] == context_test.shape[0]
    assert action_dist.shape[1] == n_actions
    assert action_dist.shape[2] == len_list