def test_save_load_no_data_replace_with_empty_str(self, cdnow_customers):
    """Round-trip a BG/NBD model saved without data, using "" as placeholder.

    The fitter is saved with ``save_data=False`` and ``values_to_save=[""]``,
    loaded into a fresh ``BetaGeoFitter``, and compared attribute by
    attribute with the original. The loaded ``data`` attribute is expected
    to be the empty string.
    """
    bgf = lt.BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(cdnow_customers["frequency"], cdnow_customers["recency"],
            cdnow_customers["T"])
    bgf.save_model(PATH_SAVE_BGNBD_MODEL, save_data=False,
                   values_to_save=[""])
    try:
        bgf_new = lt.BetaGeoFitter()
        bgf_new.load_model(PATH_SAVE_BGNBD_MODEL)

        assert bgf_new.__dict__["penalizer_coef"] == bgf.__dict__[
            "penalizer_coef"]
        assert bgf_new.__dict__["_scale"] == bgf.__dict__["_scale"]
        assert bgf_new.__dict__["params_"].equals(bgf.__dict__["params_"])
        assert bgf_new.__dict__["_negative_log_likelihood_"] == bgf.__dict__[
            "_negative_log_likelihood_"]
        assert bgf_new.__dict__["predict"](1, 1, 2, 5) == bgf.__dict__[
            "predict"](1, 1, 2, 5)
        assert (bgf_new.expected_number_of_purchases_up_to_time(1) ==
                bgf.expected_number_of_purchases_up_to_time(1))

        # Compare by value: identity (`is`) against a string literal depends
        # on interpreter interning and triggers a SyntaxWarning on 3.8+.
        loaded_data = bgf_new.__dict__["data"]
        assert isinstance(loaded_data, str) and loaded_data == ""
    finally:
        # Remove the saved model even when an assertion above fails, so a
        # failing run does not leave the file behind.
        os.remove(PATH_SAVE_BGNBD_MODEL)
def test_using_weights_col_gives_correct_results(self, cdnow_customers):
    """Fitting on aggregated rows with weights matches the unweighted fit.

    Identical (frequency, recency, T) rows are collapsed into one row whose
    weight is the number of collapsed rows; the parameters fitted on the
    weighted data must agree with the plain fit to 3 decimals.
    """
    group_cols = ["frequency", "recency", "T"]

    weighted = cdnow_customers.copy()
    weighted["weights"] = 1.0
    weighted = weighted.groupby(group_cols)["weights"].sum().reset_index()
    # Sanity check: the aggregation really merged some duplicate rows.
    assert (weighted["weights"] > 1).any()

    bgf_weights = lt.BetaGeoFitter(penalizer_coef=0.0)
    bgf_weights.fit(
        weighted["frequency"],
        weighted["recency"],
        weighted["T"],
        weights=weighted["weights"],
    )

    bgf_no_weights = lt.BetaGeoFitter(penalizer_coef=0.0)
    bgf_no_weights.fit(
        cdnow_customers["frequency"],
        cdnow_customers["recency"],
        cdnow_customers["T"],
    )

    params_plain = np.array(
        bgf_no_weights._unload_params("r", "alpha", "a", "b"))
    params_weighted = np.array(
        bgf_weights._unload_params("r", "alpha", "a", "b"))
    npt.assert_almost_equal(params_plain, params_weighted, decimal=3)
def test_fit_with_index(self, cdnow_customers):
    """Check that ``fit`` applies a custom ``index`` to the stored data.

    With an explicit (descending) index the fitter's ``data.index`` must
    equal it element-wise; with ``index=None`` it must not match that
    descending index.
    """
    index = range(len(cdnow_customers), 0, -1)

    bgf = lt.BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(cdnow_customers["frequency"], cdnow_customers["recency"],
            cdnow_customers["T"], index=index)
    # Truth-test the reduction directly instead of comparing with `== True`.
    assert (bgf.data.index == index).all()

    bgf = lt.BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(cdnow_customers["frequency"], cdnow_customers["recency"],
            cdnow_customers["T"], index=None)
    assert not (bgf.data.index == index).all()
def test_conditional_probability_alive_returns_1_if_no_repeat_purchases(
        self, cdnow_customers):
    """A customer with zero repeat purchases has alive-probability 1.0."""
    model = lt.BetaGeoFitter()
    model.fit(
        cdnow_customers["frequency"],
        cdnow_customers["recency"],
        cdnow_customers["T"],
    )

    p_alive = model.conditional_probability_alive(0, 1, 1)
    assert p_alive == 1.0
def test_params_out_is_close_to_Hardie_paper(self, cdnow_customers):
    """Fitted (r, alpha, a, b) agree with the reference values to 2 decimals."""
    model = lt.BetaGeoFitter()
    model.fit(
        cdnow_customers["frequency"],
        cdnow_customers["recency"],
        cdnow_customers["T"],
    )

    # Reference parameters in the order r, alpha, a, b.
    reference = np.array([0.243, 4.414, 0.793, 2.426])
    fitted = np.array(model._unload_params("r", "alpha", "a", "b"))
    npt.assert_array_almost_equal(reference, fitted, decimal=2)
def test_penalizer_term_will_shrink_coefs_to_0(self, cdnow_customers):
    """Every fitted parameter decreases as ``penalizer_coef`` grows.

    Three fits are compared: no penalizer, ``penalizer_coef=0.1`` and
    ``penalizer_coef=10``; each must yield strictly smaller parameters
    than the previous one.
    """
    frequency = cdnow_customers["frequency"]
    recency = cdnow_customers["recency"]
    age = cdnow_customers["T"]

    unpenalized = lt.BetaGeoFitter()
    unpenalized.fit(frequency, recency, age)
    params_none = unpenalized.params_

    lightly_penalized = lt.BetaGeoFitter(penalizer_coef=0.1)
    lightly_penalized.fit(frequency, recency, age)
    params_light = lightly_penalized.params_
    assert np.all(params_light < params_none)

    heavily_penalized = lt.BetaGeoFitter(penalizer_coef=10)
    heavily_penalized.fit(frequency, recency, age)
    params_heavy = heavily_penalized.params_
    assert np.all(params_heavy < params_light)
def test_scaling_inputs_gives_same_or_similar_results(
        self, cdnow_customers):
    """Multiplying recency and T by a constant leaves P(alive) unchanged.

    A second model is fitted on inputs scaled by 10; its internal
    ``_scale`` must be below 1.0 and its alive-probabilities, evaluated
    at correspondingly scaled points, must match the unscaled model
    within 10e-5.
    """
    scale = 10

    baseline = lt.BetaGeoFitter()
    baseline.fit(
        cdnow_customers["frequency"],
        cdnow_customers["recency"],
        cdnow_customers["T"],
    )

    scaled = lt.BetaGeoFitter()
    scaled.fit(
        cdnow_customers["frequency"],
        scale * cdnow_customers["recency"],
        scale * cdnow_customers["T"],
    )
    assert scaled._scale < 1.0

    gap = (scaled.conditional_probability_alive(1, scale * 1, scale * 2) -
           baseline.conditional_probability_alive(1, 1, 2))
    assert abs(gap) < 10e-5

    gap = (scaled.conditional_probability_alive(1, scale * 2, scale * 10) -
           baseline.conditional_probability_alive(1, 2, 10))
    assert abs(gap) < 10e-5
def test_conditional_probability_alive_is_between_0_and_1(
        self, cdnow_customers):
    """P(alive) stays inside [0, 1] over a coarse (frequency, recency, T) grid."""
    model = lt.BetaGeoFitter()
    model.fit(
        cdnow_customers["frequency"],
        cdnow_customers["recency"],
        cdnow_customers["T"],
    )

    grid = range(0, 100, 10)
    for frequency in grid:
        for recency in grid:
            # T starts at the current recency value.
            for age in range(recency, 100, 10):
                p_alive = model.conditional_probability_alive(
                    frequency, recency, age)
                assert 0 <= p_alive <= 1.0
def test_expectation_returns_same_value_Hardie_excel_sheet(
        self, cdnow_customers):
    """Expected purchase counts at several horizons match reference values.

    The model is fitted with ``tol=1e-6`` and the expectations are
    compared with the tabulated values to 3 decimals.
    """
    model = lt.BetaGeoFitter()
    model.fit(
        cdnow_customers["frequency"],
        cdnow_customers["recency"],
        cdnow_customers["T"],
        tol=1e-6,
    )

    horizons = np.array([0.1429, 1.0, 3.00, 31.8571, 32.00, 78.00])
    reference = np.array([0.0078, 0.0532, 0.1506, 1.0405, 1.0437, 1.8576])

    computed = model.expected_number_of_purchases_up_to_time(horizons)
    npt.assert_array_almost_equal(computed, reference, decimal=3)
def test_conditional_probability_alive_matrix(self, cdnow_customers):
    """Each matrix cell equals the scalar P(alive) at the same coordinates.

    Rows are used as recency and columns as frequency, with T fixed at
    the integer maximum of the fitted data's ``T`` column.
    """
    model = lt.BetaGeoFitter()
    model.fit(
        cdnow_customers["frequency"],
        cdnow_customers["recency"],
        cdnow_customers["T"],
    )

    alive_matrix = model.conditional_probability_alive_matrix()
    max_t = int(model.data["T"].max())

    assert alive_matrix[0][0] == 1

    n_rows = alive_matrix.shape[0]
    n_cols = alive_matrix.shape[1]
    for recency in range(n_rows):
        for frequency in range(n_cols):
            scalar = model.conditional_probability_alive(
                frequency, recency, max_t)
            assert alive_matrix[recency][frequency] == scalar
def test_customer_lifetime_value_with_bgf(
        self, cdnow_customers_with_monetary_value):
    """``GammaGammaFitter.customer_lifetime_value`` matches the utils helper.

    The comparison is made twice: once with no ``freq`` argument and once
    with ``freq="H"``. In both cases the two code paths must return
    exactly equal values.
    """
    customers = cdnow_customers_with_monetary_value
    frequency = customers["frequency"]
    recency = customers["recency"]
    age = customers["T"]
    monetary_value = customers["monetary_value"]

    ggf = lt.GammaGammaFitter()
    ggf.params_ = pd.Series({"p": 6.25, "q": 3.74, "v": 15.44})

    bgf = lt.BetaGeoFitter()
    bgf.fit(frequency, recency, age)

    # First pass omits `freq` entirely; second pass requests hourly.
    for extra_kwargs in ({}, {"freq": "H"}):
        ggf_clv = ggf.customer_lifetime_value(
            bgf, frequency, recency, age, monetary_value, **extra_kwargs)
        utils_clv = utils._customer_lifetime_value(
            bgf,
            frequency,
            recency,
            age,
            ggf.conditional_expected_average_profit(frequency,
                                                    monetary_value),
            **extra_kwargs
        )
        npt.assert_equal(ggf_clv.values, utils_clv.values)
def test_conditional_expectation_returns_same_value_as_Hardie_excel_sheet(
        self, cdnow_customers):
    """Conditional expected purchases for one customer match 1.226 (±0.001)."""
    model = lt.BetaGeoFitter()
    model.fit(
        cdnow_customers["frequency"],
        cdnow_customers["recency"],
        cdnow_customers["T"],
    )

    # Customer history (x, t_x, T) and forecast horizon t.
    x, t_x, T = 2, 30.43, 38.86
    t = 39
    reference = 1.226

    computed = model.conditional_expected_number_of_purchases_up_to_time(
        t, x, t_x, T)
    assert abs(reference - computed) < 0.001
def test_no_runtime_warnings_high_frequency(self, cdnow_customers):
    """P(alive) for a very high frequency is 0.0 with no floating-point error.

    NumPy is switched to raise on every floating-point error category
    while the model is fitted and evaluated, so an overflow, underflow or
    invalid operation fails the test instead of emitting a warning.
    """
    # `np.errstate` restores the previous error settings on exit even if
    # the body raises, so "raise" mode cannot leak into later tests.
    with np.errstate(all="raise"):
        bgf = lt.BetaGeoFitter(penalizer_coef=0.0)
        bgf.fit(cdnow_customers["frequency"], cdnow_customers["recency"],
                cdnow_customers["T"], index=None)
        p_alive = bgf.conditional_probability_alive(frequency=1000,
                                                    recency=10,
                                                    T=100)
    assert p_alive == 0.0
def test_save_load(self, cdnow_customers):
    """Round-trip a fitted BG/NBD model through ``save_model``/``load_model``.

    The loaded fitter must agree with the original on the penalizer,
    scale, parameters, negative log-likelihood, stored data and on the
    predictions the two objects produce.
    """
    bgf = lt.BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(cdnow_customers["frequency"], cdnow_customers["recency"],
            cdnow_customers["T"])
    bgf.save_model(PATH_SAVE_BGNBD_MODEL)
    try:
        bgf_new = lt.BetaGeoFitter()
        bgf_new.load_model(PATH_SAVE_BGNBD_MODEL)

        assert bgf_new.__dict__["penalizer_coef"] == bgf.__dict__[
            "penalizer_coef"]
        assert bgf_new.__dict__["_scale"] == bgf.__dict__["_scale"]
        assert bgf_new.__dict__["params_"].equals(bgf.__dict__["params_"])
        assert bgf_new.__dict__["_negative_log_likelihood_"] == bgf.__dict__[
            "_negative_log_likelihood_"]
        assert (bgf_new.__dict__["data"] == bgf.__dict__["data"]).all().all()
        assert bgf_new.__dict__["predict"](1, 1, 2, 5) == bgf.__dict__[
            "predict"](1, 1, 2, 5)
        assert (bgf_new.expected_number_of_purchases_up_to_time(1) ==
                bgf.expected_number_of_purchases_up_to_time(1))
    finally:
        # Remove the saved model even when an assertion above fails, so a
        # failing run does not leave the file behind.
        os.remove(PATH_SAVE_BGNBD_MODEL)
def test_conditional_expectation_overflow_error_with_high_frequency(
        self, cdnow_customers):
    """Conditional expectation is not NaN for a frequency of 1000.

    Parameters are injected directly rather than fitted.
    """
    model = lt.BetaGeoFitter()
    model.params_ = OrderedDict([
        ('r', 0.5458741247391189),
        ('alpha', 13.409316394557274),
        ('a', 0.0009994943799344323),
        ('b', 0.03899022143378801),
    ])

    t, frequency, t_x, T = 180, 1000, 0, 5
    result = model.conditional_expected_number_of_purchases_up_to_time(
        t, frequency, t_x, T)

    as_float_array = np.array([result], dtype=np.float64)
    assert not np.isnan(as_float_array)
def test_conditional_expectation_with_negative_hyp2f1_term(
        self, cdnow_customers):
    """Conditional expectation equals 5.212 (±0.001) for x = 0, t_x = 0, T = 5.

    Parameters are injected directly rather than fitted.
    """
    model = lt.BetaGeoFitter()
    model.params_ = OrderedDict([
        ('r', 0.5458741247391189),
        ('alpha', 13.409316394557274),
        ('a', 0.0009994943799344323),
        ('b', 0.03899022143378801),
    ])

    t, x, t_x, T = 180, 0, 0, 5
    reference = 5.212

    computed = model.conditional_expected_number_of_purchases_up_to_time(
        t, x, t_x, T)
    assert abs(reference - computed) < 0.001
def test_probability_of_n_purchases_up_to_time_same_as_R_BTYD(self):
    """Compare purchase-count probabilities with reference values.

    See https://cran.r-project.org/web/packages/BTYD/BTYD.pdf
    """
    model = lt.BetaGeoFitter()
    model.params_ = pd.Series({
        "r": 0.243,
        "alpha": 4.414,
        "a": 0.793,
        "b": 2.426,
    })

    # Probability of exactly 10 repeat transactions in the interval (0, 2].
    # NOTE(review): the absolute tolerance (10e-5) is far larger than the
    # expected value (~1e-7), so this check is very permissive.
    reference = 1.07869e-07
    computed = model.probability_of_n_purchases_up_to_time(2, 10)
    assert abs(reference - computed) < 10e-5

    # Probability of no repeat transactions in the interval (0, 39].
    reference = 0.5737864
    computed = model.probability_of_n_purchases_up_to_time(39, 0)
    assert abs(reference - computed) < 10e-5

    # PMF over n = 11..20 at t = 30.
    reference = np.array([
        0.0019995214,
        0.0015170236,
        0.0011633150,
        0.0009003148,
        0.0007023638,
        0.0005517902,
        0.0004361913,
        0.0003467171,
        0.0002769613,
        0.0002222260,
    ])
    pmf = []
    for n in range(11, 21):
        pmf.append(model.probability_of_n_purchases_up_to_time(30, n))
    computed = np.array(pmf)
    npt.assert_array_almost_equal(reference, computed, decimal=5)
calibration_period_end=training_end, observation_period_end=validation_end, monetary_value_col='Sales Total') rfm_train_test = rfm_train_test.loc[rfm_train_test['frequency_cal'] > 0, :] train = rfm_train_test[['frequency_cal', 'recency_cal', 'T_cal']] test = rfm_train_test[['frequency_holdout', 'duration_holdout']] print(rfm_train_test.head()) print(rfm_train_test.shape) # --------------------------------------------------------------------------------------------------------------------------- # TRAIN # ------------------------------------------------------------------------------------------------------------------------- #Beta Geometric / Negative Binomial distribution model (BG/NBD) to predict transactions (Frequency) and churn (Recency)') bgf = lifetimes.BetaGeoFitter(penalizer_coef=0.0) bgf.fit(rfm_train_test['frequency_cal'], rfm_train_test['recency_cal'], rfm_train_test['T_cal']) print(bgf.summary) lifetimes.plotting.plot_calibration_purchases_vs_holdout_purchases( bgf, rfm_train_test) plt.savefig('../images/split.png') plt.show() #_________________________________________________________________________________________________________________________ #PREDICT # -------------------------------------------------------------------------------------------------------------------------- # Probability Alive alive_prediction_bgf = bgf.conditional_probability_alive(