# Example #1
0
def test_BaseTClassifier(generate_classification_data):
    """Check that BaseTClassifier's predictions out-gain random targeting.

    A ``BaseTClassifier`` wrapping ``LogisticRegression`` is fitted on 80% of
    the generated data and used to predict on the remaining 20%. The test
    passes when the summed cumulative gain of the ``tau_pred`` column is
    greater than that of the ``Random`` column returned by ``get_cumgain``.

    Args:
        generate_classification_data: Callable returning a ``(DataFrame,
            feature_names)`` pair (presumably a pytest fixture -- confirm
            against the test suite's conftest).
    """
    # Seed before generating data so the run is reproducible.
    np.random.seed(RANDOM_SEED)

    data, feature_names = generate_classification_data()

    # Recode the group column: 0 where it equals CONTROL_NAME, 1 otherwise.
    is_control = data['treatment_group_key'] == CONTROL_NAME
    data['treatment_group_key'] = np.where(is_control, 0, 1)

    train_df, test_df = train_test_split(
        data, test_size=0.2, random_state=RANDOM_SEED
    )

    learner = BaseTClassifier(learner=LogisticRegression())
    learner.fit(
        X=train_df[feature_names].values,
        treatment=train_df['treatment_group_key'].values,
        y=train_df[CONVERSION].values,
    )

    predicted_effect = learner.predict(
        X=test_df[feature_names].values,
        treatment=test_df['treatment_group_key'].values,
    )

    # Assemble the evaluation frame consumed by get_cumgain.
    eval_columns = {
        'tau_pred': predicted_effect.flatten(),
        'W': test_df['treatment_group_key'].values,
        CONVERSION: test_df[CONVERSION].values,
        'treatment_effect_col': test_df['treatment_effect'].values,
    }
    eval_frame = pd.DataFrame(eval_columns)

    gain = get_cumgain(
        eval_frame,
        outcome_col=CONVERSION,
        treatment_col='W',
        treatment_effect_col='treatment_effect_col',
    )

    # The model-driven ordering must accumulate more gain than the random
    # targeting baseline.
    model_gain = gain['tau_pred'].sum()
    random_gain = gain['Random'].sum()
    assert model_gain > random_gain
def test_counterfactual_value_optimization():
    """Check that CounterfactualValueEstimator's picks beat the observed mix.

    The mean actual value over all test rows (the baseline) is compared with
    the mean actual value over only those test rows whose observed treatment
    equals the treatment chosen by ``CounterfactualValueEstimator``. The test
    passes when the latter is strictly greater.
    """
    data, feature_names = make_uplift_classification(
        n_samples=2000,
        treatment_name=['control', 'treatment1', 'treatment2'],
    )
    train_df, test_df = train_test_split(
        data, test_size=0.2, random_state=RANDOM_SEED
    )

    # Index labels of each split; they are also used below to index NumPy
    # arrays. NOTE(review): that presumably relies on `data` carrying a
    # default 0..n-1 index -- confirm against make_uplift_classification.
    train_rows = train_df.index
    test_rows = test_df.index

    # Per-condition conversion and impression costs.
    cc_array, ic_array, conditions = get_treatment_costs(
        treatment=data['treatment_group_key'],
        control_name='control',
        cc_dict={'control': 0, 'treatment1': 2.5, 'treatment2': 5},
        ic_dict={'control': 0, 'treatment1': 0, 'treatment2': 0.02},
    )

    # Every row gets the same conversion value of 20.
    conversion_value_array = np.full(len(data), 20)

    actual_value = get_actual_value(
        treatment=data['treatment_group_key'],
        observed_outcome=data['conversion'],
        conversion_value=conversion_value_array,
        conditions=conditions,
        conversion_cost=cc_array,
        impression_cost=ic_array,
    )

    # Baseline: mean actual value across all test rows.
    test_actual_value = actual_value.loc[test_rows]
    random_allocation_value = test_actual_value.mean()

    # Model whose predictions are handed to the estimator as `cate`.
    uplift_learner = BaseTClassifier(
        learner=LogisticRegression(), control_name='control'
    )
    uplift_learner.fit(
        train_df[feature_names].values,
        train_df['treatment_group_key'],
        train_df['conversion'],
    )
    cate_pred = uplift_learner.predict(test_df[feature_names].values)

    # Conversion-probability model trained on the features plus one dummy
    # column per treatment group.
    outcome_model = LogisticRegression()
    group_dummies = pd.get_dummies(data['treatment_group_key'])
    design = np.c_[data[feature_names], group_dummies]
    outcome_model.fit(design[train_rows], train_df['conversion'])
    conversion_proba = outcome_model.predict_proba(design[test_rows])[:, 1]

    # conditions[1:] is passed as the treatment names (presumably
    # conditions[0] is the control entry -- verify against
    # get_treatment_costs).
    estimator = CounterfactualValueEstimator(
        treatment=test_df['treatment_group_key'],
        control_name='control',
        treatment_names=conditions[1:],
        y_proba=conversion_proba,
        cate=cate_pred,
        value=conversion_value_array[test_rows],
        conversion_cost=cc_array[test_rows],
        impression_cost=ic_array[test_rows],
    )

    # Map the predicted indices back to condition names.
    best_positions = estimator.predict_best()
    recommended = [conditions[position] for position in best_positions]

    # Keep only the test rows whose observed treatment equals the
    # recommended one.
    observed = data.loc[test_rows, 'treatment_group_key']
    followed_recommendation = observed == recommended
    cve_value = test_actual_value[followed_recommendation].mean()

    assert cve_value > random_allocation_value