def test_plot_probability_alive_matrix(self):
    """Visual smoke test for ``plot_probability_alive_matrix``.

    Draws the matrix on two fresh figures -- once with default limits and
    once with ``max_recency=100, max_frequency=50`` -- then calls
    ``plt.show()``. Nothing is asserted.
    """
    from matplotlib import pyplot as plt

    # Keyword overrides for each figure, in drawing order.
    for overrides in ({}, {"max_recency": 100, "max_frequency": 50}):
        plt.figure()
        plotting.plot_probability_alive_matrix(bgf, **overrides)

    plt.show()
def test_plot_probability_alive_matrix_max_frequency_max_recency(self, bgf):
    """Check the alive-probability matrix with both axes capped at 100.

    Verifies the shape of the plotted image, one reference column of its
    data (within ``atol=0.01``), and the title and axis labels.
    """
    expected_shape = (101, 101)
    column_index = 15
    # Reference column: 50 leading zeros followed by the rising tail
    # (101 entries in total, one per image row).
    expected_column = [0] * 50 + [
        0.001, 0.001, 0.001, 0.002, 0.002, 0.003, 0.004, 0.006, 0.008, 0.010,
        0.012, 0.016, 0.020, 0.025, 0.031, 0.039, 0.048, 0.059, 0.072, 0.088,
        0.106, 0.126, 0.150, 0.178, 0.208, 0.242, 0.278, 0.318, 0.359, 0.403,
        0.447, 0.492, 0.536, 0.579, 0.621, 0.660, 0.697, 0.731, 0.763, 0.791,
        0.817, 0.839, 0.860, 0.877, 0.893, 0.907, 0.919, 0.929, 0.939, 0.947,
        0.953,
    ]

    axes = plotting.plot_probability_alive_matrix(bgf, max_frequency=100, max_recency=100)
    image_data = axes.get_images()[0].get_array()

    assert_array_equal(image_data.shape, expected_shape)
    # Only one column is compared, for brevity.
    assert_allclose(image_data[:, column_index].data, expected_column, atol=0.01)
    assert_equal(
        axes.title.get_text(),
        "Probability Customer is Alive,\nby Frequency and Recency of a Customer",
    )
    assert_equal(axes.xaxis.get_label().get_text(), "Customer's Historical Frequency")
    assert_equal(axes.yaxis.get_label().get_text(), "Customer's Recency")
    plt.close()
def test_plot_probability_alive_matrix_max_recency(self, bgf):
    """Check the alive-probability matrix when only ``max_recency`` is set.

    Verifies the shape of the plotted image, one reference column of its
    data (within ``atol=0.01``), and the title and axis labels.
    """
    expected_shape = (101, 30)
    column_index = 25
    # Reference column: 65 leading zeros followed by the rising tail
    # (101 entries in total, one per image row).
    expected_column = [0] * 65 + [
        0.001, 0.001, 0.002, 0.003, 0.004, 0.006, 0.008, 0.012, 0.017, 0.023,
        0.032, 0.043, 0.058, 0.078, 0.103, 0.134, 0.173, 0.219, 0.273, 0.333,
        0.399, 0.468, 0.537, 0.604, 0.667, 0.724, 0.774, 0.816, 0.852, 0.882,
        0.906, 0.925, 0.941, 0.953, 0.963, 0.970,
    ]

    axes = plotting.plot_probability_alive_matrix(bgf, max_recency=100)
    image_data = axes.get_images()[0].get_array()

    assert_array_equal(image_data.shape, expected_shape)
    # Only one column is compared, for brevity.
    assert_allclose(image_data[:, column_index].data, expected_column, atol=0.01)
    assert_equal(
        axes.title.get_text(),
        "Probability Customer is Alive,\nby Frequency and Recency of a Customer",
    )
    assert_equal(axes.xaxis.get_label().get_text(), "Customer's Historical Frequency")
    assert_equal(axes.yaxis.get_label().get_text(), "Customer's Recency")
    plt.close()
def test_plot_probability_alive_matrix_max_frequency(self, bgf):
    """Check the alive-probability matrix when only ``max_frequency`` is set.

    Verifies the shape of the plotted image, one reference row of its data
    (within ``atol=0.01``), and the title and axis labels.
    """
    expected_shape = (39, 101)
    row_index = 35
    # Reference values for image row 35, one per image column.
    expected_row = [
        1.0, 0.736, 0.785, 0.814, 0.833, 0.846, 0.855, 0.862, 0.866, 0.869,
        0.871, 0.872, 0.873, 0.873, 0.872, 0.871, 0.869, 0.867, 0.865, 0.862,
        0.859, 0.856, 0.852, 0.848, 0.844, 0.839, 0.834, 0.829, 0.823, 0.817,
        0.811, 0.805, 0.798, 0.791, 0.783, 0.775, 0.767, 0.759, 0.750, 0.741,
        0.731, 0.721, 0.711, 0.701, 0.690, 0.679, 0.667, 0.656, 0.644, 0.631,
        0.619, 0.606, 0.593, 0.580, 0.566, 0.552, 0.539, 0.525, 0.511, 0.496,
        0.482, 0.468, 0.454, 0.439, 0.425, 0.411, 0.397, 0.383, 0.369, 0.355,
        0.342, 0.329, 0.316, 0.303, 0.290, 0.278, 0.266, 0.254, 0.243, 0.232,
        0.221, 0.211, 0.201, 0.191, 0.182, 0.173, 0.164, 0.156, 0.148, 0.140,
        0.133, 0.126, 0.119, 0.113, 0.106, 0.101, 0.095, 0.090, 0.085, 0.080,
        0.075,
    ]

    axes = plotting.plot_probability_alive_matrix(bgf, max_frequency=100)
    image_data = axes.get_images()[0].get_array()

    assert_array_equal(image_data.shape, expected_shape)
    # Only one row is compared, for brevity.
    assert_allclose(image_data[row_index, :].data, expected_row, atol=0.01)
    assert_equal(
        axes.title.get_text(),
        "Probability Customer is Alive,\nby Frequency and Recency of a Customer",
    )
    assert_equal(axes.xaxis.get_label().get_text(), "Customer's Historical Frequency")
    assert_equal(axes.yaxis.get_label().get_text(), "Customer's Recency")
    plt.close()
def viz_bgf(self, t):
    """Draw four diagnostic plots for ``self.bgf`` and save each one as a PNG.

    Each plot is drawn, written to a fixed file name in the current working
    directory via ``plt.savefig``, and then closed with ``plt.close()``.

    Args:
        t: forwarded as ``T`` to ``plot_frequency_recency_matrix``.
    """
    # (plot function, extra keyword arguments, output file), in drawing order.
    charts = (
        # customer frequency and recency matrix
        (plot_frequency_recency_matrix, {'T': t, 'cmap': 'coolwarm'},
         'sales_frequency_recency_matrix.png'),
        # customer alive probability
        (plot_probability_alive_matrix, {'cmap': 'coolwarm'},
         'probability_alive_matrix.png'),
        # expected repeat purchases
        (plot_expected_repeat_purchases, {},
         'ProbabilityExpectedRepeatPurchases.png'),
        # expected number of period transactions
        (plot_period_transactions, {},
         'period_transactions.png'),
    )

    for draw, options, png_name in charts:
        draw(self.bgf, **options)
        plt.savefig(png_name)
        plt.close()
def test_plot_probability_alive_matrix(self, bgf):
    """Check the alive-probability matrix drawn with default limits.

    Verifies the shape of the plotted image, one reference row of its data
    (within ``atol=0.01``), and the title and axis labels.
    """
    expected_shape = (39, 30)
    row_index = 35
    # Reference values for image row 35, one per image column.
    expected_row = [
        1.0, 0.736, 0.785, 0.814, 0.833, 0.846, 0.855, 0.862, 0.866, 0.869,
        0.871, 0.872, 0.873, 0.873, 0.872, 0.871, 0.869, 0.867, 0.865, 0.862,
        0.859, 0.856, 0.852, 0.848, 0.844, 0.839, 0.834, 0.829, 0.823, 0.817,
    ]

    axes = plotting.plot_probability_alive_matrix(bgf)
    image_data = axes.get_images()[0].get_array()

    assert_array_equal(image_data.shape, expected_shape)
    # Only one row is compared, for brevity.
    assert_allclose(image_data[row_index, :].data, expected_row, atol=0.01)
    assert_equal(
        axes.title.get_text(),
        "Probability Customer is Alive,\nby Frequency and Recency of a Customer",
    )
    assert_equal(axes.xaxis.get_label().get_text(), "Customer's Historical Frequency")
    assert_equal(axes.yaxis.get_label().get_text(), "Customer's Recency")
    plt.close()
def test_plot_probability_alive_matrix_max_frequency(self):
    """Plot the alive-probability matrix with ``max_frequency=100``.

    Opens a new figure, draws the matrix, and returns pyplot's current
    figure.
    NOTE(review): returning the figure suggests an image-comparison harness
    consumes it -- confirm against the test decorators/config.
    """
    plt.figure()
    plotting.plot_probability_alive_matrix(bgf, max_frequency=100)
    current_figure = plt.gcf()
    return current_figure
def test_plot_probability_alive_matrix(self):
    """Plot the alive-probability matrix with default limits.

    Opens a new figure, draws the matrix, and returns pyplot's current
    figure.
    NOTE(review): returning the figure suggests an image-comparison harness
    consumes it -- confirm against the test decorators/config.
    """
    plt.figure()
    plotting.plot_probability_alive_matrix(bgf)
    current_figure = plt.gcf()
    return current_figure
def get_clv(oracle_conn_id, src_client_id, storage_bucket, ds, **context):
    """Fit BG/NBD and Gamma-Gamma models for one client and upload the results.

    Steps, in order:

    1. Run the SQL found at ``context['templates_dict']['query']`` on the
       Oracle connection and lower-case the resulting column names.
    2. Build calibration/holdout RFM data (calibration ends 2018-05-24) and
       fit a ``BetaGeoFitter`` on the calibration columns.
    3. Save four diagnostic charts as SVG files in the working directory.
    4. Fit a ``GammaGammaFitter`` on customers with ``frequency > 0``,
       compute customer lifetime value, and write RFM + CLV to a CSV file.
    5. Upload the CSV and the four SVGs to ``storage_bucket`` under
       ``<src_client_id>/<ds_nodash>/``.

    Args:
        oracle_conn_id: connection id handed to ``OracleHook``.
        src_client_id: client identifier; used in file names and the
            upload path.
        storage_bucket: destination bucket for the uploads.
        ds: not used by this function; kept so existing callers still work.
        **context: must provide ``templates_dict['query']`` and
            ``ds_nodash``.
            NOTE(review): looks like an Airflow task context -- confirm.

    Raises:
        KeyError: if ``templates_dict`` or its ``query`` entry is missing.
    """
    import matplotlib.pyplot
    matplotlib.pyplot.ioff()
    ##
    from lifetimes.utils import calibration_and_holdout_data
    from lifetimes.plotting import plot_frequency_recency_matrix
    from lifetimes.plotting import plot_probability_alive_matrix
    from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases
    from lifetimes.plotting import plot_period_transactions
    from lifetimes.plotting import plot_history_alive
    from lifetimes.plotting import plot_cumulative_transactions
    from lifetimes.utils import expected_cumulative_transactions
    from lifetimes.utils import summary_data_from_transaction_data
    from lifetimes import BetaGeoFitter
    from lifetimes import GammaGammaFitter
    import datetime
    import pandas as pd
    import datalab.storage as gcs

    conn = OracleHook(oracle_conn_id=oracle_conn_id).get_conn()
    try:
        print(src_client_id, context)
        query = context['templates_dict']['query']
        data = pd.read_sql(query, con=conn)
    finally:
        # The connection is only needed for this one query; release it even
        # when the lookup or the query fails.
        conn.close()
    data.columns = data.columns.str.lower()
    print(data.head())

    # Calculate RFM values
    calibration_end_date = datetime.datetime(2018, 5, 24)
    training_rfm = calibration_and_holdout_data(
        transactions=data,
        customer_id_col='src_user_id',
        datetime_col='pickup_date',
        calibration_period_end=calibration_end_date,
        freq='D',
        monetary_value_col='price_total')
    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(training_rfm['frequency_cal'], training_rfm['recency_cal'],
            training_rfm['T_cal'])
    print(bgf)

    # Chart file names: <ds_nodash><client id>_<chart name>.svg
    ds_nodash = context.get("ds_nodash")
    client = str(src_client_id)
    plot_period_transactions_chart = (
        ds_nodash + client + '_plot_period_transactions_chart.svg')
    plot_frequency_recency_chart = (
        ds_nodash + client + '_plot_frequency_recency_matrix.svg')
    plot_probability_chart = (
        ds_nodash + client + '_plot_probability_alive_matrix.svg')
    plot_calibration_vs_holdout_chart = (
        ds_nodash + client + '_plot_calibration_vs_holdout_purchases.svg')

    # Each figure is closed right after it is saved so pyplot does not keep
    # it open for the life of the process and a later plot cannot draw onto
    # an earlier figure.
    ax0 = plot_period_transactions(bgf, max_frequency=30)
    ax0.figure.savefig(plot_period_transactions_chart, format='svg')
    matplotlib.pyplot.close(ax0.figure)

    ax1 = plot_frequency_recency_matrix(bgf)
    ax1.figure.savefig(plot_frequency_recency_chart, format='svg')
    matplotlib.pyplot.close(ax1.figure)

    ax2 = plot_probability_alive_matrix(bgf)
    ax2.figure.savefig(plot_probability_chart, format='svg')
    matplotlib.pyplot.close(ax2.figure)

    ax3 = plot_calibration_purchases_vs_holdout_purchases(bgf, training_rfm, n=50)
    ax3.figure.savefig(plot_calibration_vs_holdout_chart, format='svg')
    matplotlib.pyplot.close(ax3.figure)

    full_rfm = summary_data_from_transaction_data(
        data,
        customer_id_col='src_user_id',
        datetime_col='pickup_date',
        monetary_value_col='price_total',
        datetime_format=None,
        observation_period_end=None,
        freq='D')
    returning_full_rfm = full_rfm[full_rfm['frequency'] > 0]
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(returning_full_rfm['frequency'],
            returning_full_rfm['monetary_value'])

    customer_lifetime = 30  # expected number of months lifetime of a customer
    clv = ggf.customer_lifetime_value(
        bgf,  # the model to use to predict the number of future transactions
        full_rfm['frequency'],
        full_rfm['recency'],
        full_rfm['T'],
        full_rfm['monetary_value'],
        time=customer_lifetime,  # months
        discount_rate=0.01  # monthly discount rate ~ 12.7% annually
    ).sort_values(ascending=False)
    full_rfm_with_value = full_rfm.join(clv)

    full_rfm_file = ds_nodash + "-src_client_id-" + client + '-icabbi-test.csv'
    full_rfm_with_value.to_csv(full_rfm_file)

    # Upload every artifact from its OWN local file. The chart uploads used
    # to pass the CSV path as ``filename``, so each .svg object in the bucket
    # actually contained the CSV data.
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id='google_conn_default')
    object_prefix = client + "/" + ds_nodash + "/"
    artifacts = (
        full_rfm_file,
        plot_period_transactions_chart,
        plot_frequency_recency_chart,
        plot_probability_chart,
        plot_calibration_vs_holdout_chart,
    )
    for local_file in artifacts:
        gcs_hook.upload(
            bucket=storage_bucket,
            object=object_prefix + local_file,
            filename=local_file)
def test_plot_probability_alive_matrix_max_frequency_max_recency(self):
    """Plot the alive-probability matrix with both limits set to 100.

    Opens a new figure, draws the matrix, and returns pyplot's current
    figure.
    NOTE(review): returning the figure suggests an image-comparison harness
    consumes it -- confirm against the test decorators/config.
    """
    plt.figure()
    plotting.plot_probability_alive_matrix(bgf, max_frequency=100, max_recency=100)
    current_figure = plt.gcf()
    return current_figure
def visualizeProbabilityAliveMatrix(betaGeoFitterModel):
    """Plot the probability-alive matrix for the model and save it as a PNG.

    The image is written to ``ProbabilityAliveMatrixPlot.png`` in the current
    working directory via ``pylab.savefig``.
    """
    output_path = "ProbabilityAliveMatrixPlot.png"
    plot_probability_alive_matrix(betaGeoFitterModel)
    pylab.savefig(output_path)
display() # COMMAND ---------- # MAGIC %md From this chart, we can see this customer made his or her first purchase in January 2011 followed by a repeat purchase later that month. There was about a 1-month lull in activity during which the probability of the customer being alive declined slightly but with purchases in March, April and June of that year, the customer sent repeated signals that he or she was engaged. Since that last June purchase, the customer hasn't been seen in our transaction history, and our belief that the customer remains engaged has been dropping though as a moderate pace given the signals previously sent. # MAGIC # MAGIC How does the model arrive at these probabilities? The exact math is tricky but by plotting the probability of being alive as a heatmap relative to frequency and recency, we can understand the probabilities assigned to the intersections of these two values: # COMMAND ---------- from lifetimes.plotting import plot_probability_alive_matrix # set figure size plt.subplots(figsize=(12, 8)) plot_probability_alive_matrix(model) display() # COMMAND ---------- # MAGIC %md In addition to predicting the probability a customer is still alive, we can calculate the number of purchases expected from a customer over a given future time interval, such as over the next 30-days: # COMMAND ---------- from lifetimes.plotting import plot_frequency_recency_matrix # set figure size plt.subplots(figsize=(12, 8)) plot_frequency_recency_matrix(model, T=30)
on='customer_id') df_final['wholesaler'] = np.where(df_final['predicted_cltv'] < 1000, 0, 1) df_final['churn_group'] = np.where(df_final['probability_alive'] < .5, 0, 1) df_final # Plots and Validation plot_period_transactions(bgf_mod) cal_hold = calibration_and_holdout_data( df, 'customer_id', 'date', calibration_period_end='2018-12-31', #3 years calibration observation_period_end='2020-12-31', #2 year holdout freq=frq) # plots the efficacy of the model using the hold-out period plt.rcParams['figure.figsize'] = (20, 10) bgf = BetaGeoFitter() bgf.fit(cal_hold['frequency_cal'], cal_hold['recency_cal'], cal_hold['T_cal']) plot_calibration_purchases_vs_holdout_purchases(bgf, cal_hold) fig = plt.figure(figsize=(8, 6)) plot_frequency_recency_matrix(bgf_mod) fig = plt.figure(figsize=(8, 6)) plot_probability_alive_matrix(bgf_mod)
# Data check # ========================================================================== # Order distribution by frequency df["FREQUENCY"].plot(kind="hist", bins=50) # ========================================================================== # BG/NBD model # ========================================================================== bgf = BetaGeoFitter(penalizer_coef=0.01) bgf.fit(df["FREQUENCY"], df["RECENCY"], df["T"]) bgf.summary plotting.plot_frequency_recency_matrix(bgf) plotting.plot_probability_alive_matrix(bgf) # Repeat transaction model check plotting.plot_period_transactions(bgf) # ========================================================================== # Ranking reps from best to worst # ========================================================================== t = 1 df["predicted_purchases"] = bgf.conditional_expected_number_of_purchases_up_to_time( t, df["FREQUENCY"], df["RECENCY"], df["T"]) df.sort_values(by="predicted_purchases").tail(10) # ========================================================================== # Gamma Gamme Model
# similar API to scikit-learn and lifelines. bgf = BetaGeoFitter(penalizer_coef=0.0) bgf.fit(X_train['txn_total'], X_train['recency_true']/7, X_train['T']/7) print(bgf) %matplotlib inline from lifetimes.plotting import plot_frequency_recency_matrix plot_frequency_recency_matrix(bgf) #%% from lifetimes.plotting import plot_probability_alive_matrix f=plot_probability_alive_matrix(bgf) t=52 X_train['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time( t, X_train['txn_total'], X_train['recency_true']/7, X_train['T']/7) #%% from lifetimes.plotting import plot_period_transactions f = plot_period_transactions(bgf) #%% X_train.sort_values('predicted_purchases') #%% # X_train.sort_values(by='predicted_purchases').head(5) from lifetimes.plotting import plot_period_transactions f = plot_period_transactions(bgf)
bgf = BetaGeoFitter(penalizer_coef=0.0) bgf.fit(summary['frequency'], summary['recency'], summary['T']) print(bgf) bgf.summary from lifetimes.plotting import plot_frequency_recency_matrix import matplotlib.pyplot as plt plt.rcParams['figure.figsize'] = [10, 10] plot_frequency_recency_matrix(bgf, title="") from lifetimes.plotting import plot_probability_alive_matrix plot_probability_alive_matrix(bgf, title="") t = 12 summary[ 'predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time( t, summary['frequency'], summary['recency'], summary['T']) summary.sort_values(by='predicted_purchases').tail(10) from lifetimes.plotting import plot_period_transactions plt.rcParams['figure.figsize'] = [12, 3] plot_period_transactions(bgf) from lifetimes.utils import calibration_and_holdout_data summary_cal_holdout = calibration_and_holdout_data(