import pandas
from dateutil.parser import parse
from lifetimes import BetaGeoFitter, GammaGammaFitter
from lifetimes.utils import summary_data_from_transaction_data


def calc_clv(clv_recs, end, months=12):
    df = pandas.DataFrame(clv_recs)
    df = df[['player_id', 'start_date', 'theo_win']]
    df['theo_win'] = df['theo_win'].astype(float)
    end_date = parse(end)

    summary = summary_data_from_transaction_data(df, 'player_id', 'start_date',
                                                 monetary_value_col='theo_win',
                                                 observation_period_end=end_date)

    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])

    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(summary['frequency'], summary['monetary_value'])

    ggf_clv = ggf.customer_lifetime_value(
        bgf,  # the model used to predict the number of future transactions
        summary['frequency'],
        summary['recency'],
        summary['T'],
        summary['monetary_value'],
        time=months,
        discount_rate=0.0
    )
    clv_df = pandas.DataFrame(ggf_clv)
    clv_df = clv_df.dropna()
    # Clip negative CLV estimates to zero; target only the 'clv' column rather
    # than assigning to whole rows as the original code did.
    clv_df.loc[clv_df['clv'] < 0, 'clv'] = 0.0
    summary = summary.merge(clv_df, left_index=True, right_index=True,
                            how='inner')
    return summary
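# --- Usage sketch (not from the original source) ---
# A minimal illustration of the record layout calc_clv() expects, inferred
# from the columns it selects above. Real use needs many players; the BG/NBD
# and Gamma-Gamma fitters will not converge on a handful of records, so the
# call itself is left commented out.
records = [
    {'player_id': 1, 'start_date': '2020-01-05', 'theo_win': '25.00'},
    {'player_id': 1, 'start_date': '2020-02-11', 'theo_win': '40.00'},
    {'player_id': 2, 'start_date': '2020-01-20', 'theo_win': '10.00'},
]
# clv_summary = calc_clv(records, end='2020-06-30', months=12)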
def summaryOutput(self, discount_rate=0.12, months=12):
    '''
    Fit the beta-geometric model to calculate CLV, and use the Gamma-Gamma
    model to calculate expected profit per customer.
    Write out CLV and profits to csv, print out averages to screen.
    '''
    beta_model = BetaGeoFitter()

    # calculate average transaction value
    self.summary_monetary['avg_transaction_value'] = self.ggf.conditional_expected_average_profit(
        self.summary_monetary['frequency'],
        self.summary_monetary['monetary_value'])

    # fit beta geo model
    beta_model.fit(self.summary_monetary['frequency'],
                   self.summary_monetary['recency'],
                   self.summary_monetary['T'])

    # convert the annual discount rate to a monthly rate
    # (the original expression divided by months and again by 30, which yields
    # a far smaller, roughly daily, rate and contradicts the comment below)
    disc_rate = discount_rate / 12.0

    # calculate clv over the prediction horizon (default one year)
    self.summary_monetary['clv'] = self.ggf.customer_lifetime_value(
        beta_model,  # the model used to predict the number of future transactions
        self.summary_monetary['frequency'],
        self.summary_monetary['recency'],
        self.summary_monetary['T'],
        self.summary_monetary['monetary_value'],
        time=months,  # months
        discount_rate=disc_rate  # monthly discount rate ~ 12.7% annually
    )

    # print customer data with calculations
    self.summary_monetary.to_csv("CLV_AVG_transactionValue_perCustomer.csv",
                                 index=False)

    # print summary stats
    print("Expected conditional average profit: {}, Average profit: {}".format(
        self.ggf.conditional_expected_average_profit(
            self.summary_monetary['frequency'],
            self.summary_monetary['monetary_value']).mean(),
        self.summary_monetary[self.summary_monetary['frequency'] > 0]['monetary_value'].mean()))
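# --- Worked check (not from the original source) ---
# The inline comment above claims a 1% monthly discount rate is roughly 12.7%
# annually; compounding confirms it.
annual_equivalent = (1 + 0.12 / 12) ** 12 - 1
print(round(annual_equivalent, 4))  # 0.1268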
def train_metric(d, metric, plot=True, penalty=0):
    frequency = metric + "_frequency"
    recency = metric + "_recency"
    T = metric + "_T"

    # keep repeat customers only; work on a copy to avoid mutating the input
    train = d[(d[frequency] > 0) & (d[recency] >= 0)].copy()
    train[frequency] = train[frequency] - 1

    bgf = BetaGeoFitter(penalizer_coef=penalty)
    bgf.fit(train[frequency], train[recency], train[T])

    # compare actual repeat-purchase counts against counts simulated from the
    # fitted model
    n = bgf.data.shape[0]
    simulated_data = bgf.generate_new_data(size=n)

    model_counts = pd.DataFrame(
        bgf.data["frequency"].value_counts().sort_index().iloc[:28])
    simulated_counts = pd.DataFrame(
        simulated_data["frequency"].value_counts().sort_index().iloc[:28])
    combined_counts = model_counts.merge(simulated_counts, how="outer",
                                         left_index=True,
                                         right_index=True).fillna(0)
    combined_counts.columns = ["Actual", "Model"]

    if plot:
        combined_counts.plot.bar()
        display()

    return combined_counts, bgf
def single_customer_evaluation(time_units=243):
    """
    Predicts the number of purchases of a randomly chosen customer from the
    dataset (conditional_expected_number_of_purchases_up_to_time).

    Parameters
    ----------
    time_units: int, default=243.
        Number of days for prediction.

    Returns
    -------
    (frequency_predicted, frequency_holdout)
    """
    # Loading Calibration Model.
    cal_bg_nbd = BetaGeoFitter(penalizer_coef=0.0)
    cal_bg_nbd.load_model(path="models/calibration_model.pkl")

    # Loading summary_cal_holdout dataset.
    summary_cal_holdout = pd.read_csv("datasets/summary_cal_holdout.csv")

    # Randomly sample a single customer.
    individual = summary_cal_holdout.sample()

    frequency_prediction = cal_bg_nbd.predict(
        t=time_units,
        frequency=individual["frequency_cal"],
        recency=individual["recency_cal"],
        T=individual["T_cal"])
    frequency_holdout = individual["frequency_holdout"]

    return frequency_prediction, frequency_holdout
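# --- Usage sketch (not from the original source) ---
# Compares the point prediction to the held-out truth for one sampled
# customer; assumes the model and dataset paths referenced above exist.
pred, actual = single_customer_evaluation(time_units=243)
print(pred.iloc[0], actual.iloc[0])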
def test_plot_incremental_transactions(self):
    """Test plotting incremental transactions with the CDNOW example."""
    transactions = load_dataset('CDNOW_sample.txt', header=None, sep=r'\s+')
    transactions.columns = [
        'id_total', 'id_sample', 'date', 'num_cd_purc', 'total_value'
    ]
    t = 39
    freq = 'W'

    transactions_summary = utils.summary_data_from_transaction_data(
        transactions, 'id_sample', 'date', datetime_format='%Y%m%d',
        observation_period_end='19970930', freq=freq)

    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(transactions_summary['frequency'],
            transactions_summary['recency'],
            transactions_summary['T'])

    plt.figure()
    plotting.plot_incremental_transactions(bgf, transactions, 'date',
                                           'id_sample', 2 * t, t, freq=freq,
                                           xlabel='week',
                                           datetime_format='%Y%m%d')
    return plt.gcf()
def bgf(cd_data):
    bgf_model = BetaGeoFitter()
    bgf_model.fit(cd_data['frequency'],
                  cd_data['recency'],
                  cd_data['T'],
                  iterative_fitting=1)
    return bgf_model
def fitted_bg(example_summary_data):
    bg = BetaGeoFitter()
    bg.fit(example_summary_data['frequency'],
           example_summary_data['recency'],
           example_summary_data['T'],
           iterative_fitting=0)
    return bg
def readBetaGeoFitterModel():
    betaGeoFitterModel = BetaGeoFitter()
    betaGeoFitterModel.load_model("BetaGeoFitterModel.pkl")
    return betaGeoFitterModel
def create_cltv_pred(dataframe, w=4, m=1):
    """
    Fit Gamma-Gamma and BG/NBD models and add their predictions to the frame.

    Parameters
    ----------
    dataframe
    w: int, number of weeks ahead for the BG/NBD sales prediction
    m: int, number of months ahead for the Gamma-Gamma CLTV prediction

    Returns
    -------
    Dataframe
    """
    # BG/NBD
    dataframe = dataframe[dataframe["monetary_avg"] > 0]
    dataframe["frequency"] = dataframe["frequency"].astype(int)

    bgf = BetaGeoFitter(penalizer_coef=0.001)
    bgf.fit(dataframe['frequency'],
            dataframe['recency_weekly'],
            dataframe['T_weekly'])

    dataframe[f'exp_sales_{w}_week'] = bgf.predict(w,
                                                   dataframe['frequency'],
                                                   dataframe['recency_weekly'],
                                                   dataframe['T_weekly'])

    # Gamma-Gamma - expected_average_profit
    ggf = GammaGammaFitter(penalizer_coef=0.001)
    ggf.fit(dataframe['frequency'], dataframe['monetary_avg'])
    dataframe["expected_average_profit"] = ggf.conditional_expected_average_profit(
        dataframe['frequency'], dataframe['monetary_avg'])

    # CLTV prediction
    cltv = ggf.customer_lifetime_value(bgf,
                                       dataframe['frequency'],
                                       dataframe['recency_weekly'],
                                       dataframe['T_weekly'],
                                       dataframe['monetary_avg'],
                                       time=m,
                                       freq="W",
                                       discount_rate=0.01)
    dataframe[f'cltv_p_{m}_month'] = cltv

    scaler = MinMaxScaler(feature_range=(1, 100))
    dataframe['cltv_p_score'] = scaler.fit_transform(
        dataframe[[f'cltv_p_{m}_month']])

    # cltv_p segment
    dataframe['cltv_p_segment'] = pd.qcut(dataframe['cltv_p_score'], 3,
                                          labels=['C', 'B', 'A'])

    new_col = dataframe.columns[~dataframe.columns.
                                isin(['recency', 'frequency', 'monetary'])]
    dataframe = dataframe[new_col]

    return dataframe
def test_expected_cumulative_transactions_date_index(cdnow_transactions):
    """
    Test set_index as date for cumulative transactions and bgf fitter.

    Take the first 14 days of cdnow transactions and validate that a date
    index with freq_multiplier = 1 works, comparing the last 4 records
    against tested data:

    dates = ['1997-01-11', '1997-01-12', '1997-01-13', '1997-01-14']
    actual_trans = [11, 12, 15, 19]
    expected_trans = [10.67, 12.67, 14.87, 17.24]
    """
    datetime_col = "date"
    customer_id_col = "id_sample"
    t = 14
    datetime_format = "%Y%m%d"
    freq = "D"
    observation_period_end = "19970930"
    freq_multiplier = 1

    transactions_summary = utils.summary_data_from_transaction_data(
        cdnow_transactions,
        customer_id_col,
        datetime_col,
        datetime_format=datetime_format,
        freq=freq,
        freq_multiplier=freq_multiplier,
        observation_period_end=observation_period_end,
    )

    transactions_summary = transactions_summary.reset_index()

    model = BetaGeoFitter()
    model.fit(transactions_summary["frequency"],
              transactions_summary["recency"],
              transactions_summary["T"])

    df_cum = utils.expected_cumulative_transactions(
        model,
        cdnow_transactions,
        datetime_col,
        customer_id_col,
        t,
        datetime_format,
        freq,
        set_index_date=True,
        freq_multiplier=freq_multiplier,
    )

    dates = ["1997-01-11", "1997-01-12", "1997-01-13", "1997-01-14"]
    actual_trans = [11, 12, 15, 19]
    expected_trans = [10.67, 12.67, 14.87, 17.24]

    date_index = df_cum.iloc[-4:].index.to_timestamp().astype(str)
    actual = df_cum["actual"].iloc[-4:].values
    predicted = df_cum["predicted"].iloc[-4:].values.round(2)

    assert all(dates == date_index)
    assert_allclose(actual, actual_trans)
    assert_allclose(predicted, expected_trans, atol=1e-2)
def bgf_transactions(cdnow_transactions):
    transactions_summary = utils.summary_data_from_transaction_data(
        cdnow_transactions, 'id_sample', 'date',
        datetime_format='%Y%m%d',
        observation_period_end='19970930', freq='W')

    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(transactions_summary['frequency'],
            transactions_summary['recency'],
            transactions_summary['T'])
    return bgf
def fitted_bg(example_summary_data):
    bg = BetaGeoFitter()
    bg.fit(
        example_summary_data["frequency"],
        example_summary_data["recency"],
        example_summary_data["T"],
        iterative_fitting=2,
        tol=1e-6,
    )
    return bg
def bgnbd_model(summary):
    """Instantiate and fit a BG/NBD model.

    Args:
        summary: RFM transaction data

    Returns:
        bgnbd model fit to the data
    """
    bgf = BetaGeoFitter(penalizer_coef=PENALIZER_COEF)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])
    return bgf
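# --- Usage sketch (not from the original source) ---
# Exercises bgnbd_model() on the CDNOW summary data bundled with lifetimes.
# PENALIZER_COEF is assumed to be a module-level constant in the original code.
from lifetimes.datasets import load_cdnow_summary

PENALIZER_COEF = 0.0  # assumption: defined elsewhere in the original module
cdnow_summary = load_cdnow_summary(index_col=[0])
model = bgnbd_model(cdnow_summary)
print(model.summary)  # fitted r, alpha, a, b parameters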
def rfm_model(data, end_date, f, p):
    rfm1 = lifetimes.utils.summary_data_from_transaction_data(
        data, 'customer_id', 'date',
        monetary_value_col='amount',
        observation_period_end=end_date,
        freq=f)
    # keep customers with average order value below 600 (filters extreme values)
    rfm1 = rfm1[rfm1.monetary_value < 600]
    bgf = BetaGeoFitter(penalizer_coef=p)
    bgf.fit(rfm1['frequency'], rfm1['recency'], rfm1['T'])
    return rfm1, bgf
def evaluation_plots(plot_type):
    """
    Evaluation Plots:
        - Tracking Cumulative Transactions
        - Tracking Daily Transactions
        - Frequency of Repeated Transactions
        - Calibration vs Holdout.

    Parameters
    ----------
    plot_type: str.
        "tracking" - Tracking Cumulative and Tracking Daily Transactions.
        "repeated" - Frequency of Repeated Transactions.
        "calibration_holdout" - Calibration vs Holdout Purchases.
    """
    # Loading Calibration Model.
    cal_bg_nbd = BetaGeoFitter(penalizer_coef=0.0)
    cal_bg_nbd.load_model(path="models/calibration_model.pkl")

    # Loading summary_cal_holdout dataset.
    summary_cal_holdout = pd.read_csv("datasets/summary_cal_holdout.csv")

    # Loading Transactions.
    transactions = pd.read_csv("datasets/transactions.csv")

    if plot_type == "tracking":
        fig = plt.figure(figsize=(20, 4))
        plot_cumulative_transactions(model=cal_bg_nbd,
                                     transactions=transactions,
                                     datetime_col="order_purchase_timestamp",
                                     customer_id_col="customer_unique_id",
                                     t=604,
                                     t_cal=512,
                                     freq="D",
                                     ax=fig.add_subplot(121))
        plot_incremental_transactions(model=cal_bg_nbd,
                                      transactions=transactions,
                                      datetime_col="order_purchase_timestamp",
                                      customer_id_col="customer_unique_id",
                                      t=604,
                                      t_cal=512,
                                      freq="D",
                                      ax=fig.add_subplot(122))
    elif plot_type == "repeated":
        plot_period_transactions(model=cal_bg_nbd)
    elif plot_type == "calibration_holdout":
        plot_calibration_purchases_vs_holdout_purchases(
            model=cal_bg_nbd, calibration_holdout_matrix=summary_cal_holdout)
    return
def upload():
    # (the original placed a `# -*- coding: utf-8 -*-` declaration here; a
    # source-encoding cookie is only meaningful at the top of a file, so it
    # has been dropped)
    if request.method == 'POST':
        f = request.files['file']
        basepath = os.path.dirname(__file__)
        file_path = os.path.join(basepath, 'uploads',
                                 secure_filename(f.filename))
        f.save(file_path)

        df = pd.read_csv(file_path)
        df['salesDate'] = pd.to_datetime(df['salesDate'])
        cols_of_interest = ['memberID', 'salesDate', 'sales']
        df = df[cols_of_interest]
        df['memberID'] = df['memberID'].apply(lambda x: format(x, '.0f'))

        # keep only the last 12 months of transactions
        max_date = df['salesDate'].max()
        min_date = max_date - relativedelta(months=+12)
        df = df.loc[(df['salesDate'] >= min_date) &
                    (df['salesDate'] <= max_date)]

        min_order = df['salesDate'].min()
        max_order = df['salesDate'].max()

        data = summary_data_from_transaction_data(
            df, 'memberID', 'salesDate',
            monetary_value_col='sales',
            observation_period_end=max_order)
        d2 = data.sort_values('frequency', ascending=False)

        bgf = BetaGeoFitter(penalizer_coef=0.0001)
        bgf.fit(data['frequency'], data['recency'], data['T'])

        # expected purchases per customer over the next 30 days
        t = 30
        data['customer_livelyhood'] = bgf.conditional_expected_number_of_purchases_up_to_time(
            t, data['frequency'], data['recency'], data['T'])
        data.sort_values(by='customer_livelyhood', ascending=False,
                         inplace=True)
        return data.to_html()
    return None
def trainBetaGeoFitterModel():
    summaryDataFromTransactionDataForCLV = readsummaryDataFromTransactionDataForCLV()

    # training model
    betaGeoFitterModel = BetaGeoFitter(penalizer_coef=0.0)
    betaGeoFitterModel.fit(summaryDataFromTransactionDataForCLV["frequency"],
                           summaryDataFromTransactionDataForCLV["recency"],
                           summaryDataFromTransactionDataForCLV["T"])

    # saving the model in a pickle file
    saveBetaGeoFitterModel(betaGeoFitterModel)
    print(betaGeoFitterModel.summary)
def fit(self, months=96):
    """
    Computes CLV estimates for the next n months and stores results in
    self.results

    INPUT
        months (int) number of months to predict, default = 96 (8 years)
    """
    ### PREDICT NUMBER OF PURCHASES
    self.bgf = BetaGeoFitter()  # see lifetimes module documentation for details
    self.bgf.fit(self.data['frequency'], self.data['recency'], self.data['T'])

    # 8 years = 96 months
    self.data['predicted_purchases'] = \
        self.bgf.conditional_expected_number_of_purchases_up_to_time(
            months,
            self.data['frequency'],
            self.data['recency'],
            self.data['T'])

    ### PREDICT FUTURE PURCHASE AMOUNT
    self.ggf = GammaGammaFitter(penalizer_coef=0)
    self.ggf.fit(self.data['frequency'], self.data['monetary_value'])

    # predict next transaction
    self.data['predicted_trans_profit'] = self.ggf.conditional_expected_average_profit(
        frequency=self.data['frequency'],
        monetary_value=self.data['monetary_value'])

    ### ESTIMATE CLV
    self.data['clv_estimation'] = (self.data['predicted_trans_profit'] *
                                   self.data['predicted_purchases'])
    self.data['prob_alive'] = self.bgf.conditional_probability_alive(
        self.data['frequency'],
        self.data['recency'],
        self.data['T'])
    self.results = self.data.sort_values(by='clv_estimation', ascending=False)

    # store results
    self.results.to_csv(self.outfile2, index=False)
def probability_alive(historical_rfm_data):
    """
    Predicted Conditional Probability Alive.

    Parameters
    ----------
    historical_rfm_data:
        Historical Frequency, Recency & T of an individual

    Returns
    -------
    Conditional Probability Alive.
    """
    clv_model = BetaGeoFitter(penalizer_coef=0.0)
    clv_model.load_model(path="models/customer_lifetime_estimator.pkl")

    alive_probability = clv_model.conditional_probability_alive(
        frequency=historical_rfm_data["frequency"],
        recency=historical_rfm_data["recency"],
        T=historical_rfm_data["T"])

    return alive_probability
def root_mean_squared_error(time_units=243):
    """
    Calculates the Root Mean Squared Error of all predictions.

    Parameters
    ----------
    time_units: int, default=243.
        Number of days for prediction.

    Yields
    ------
    summary_cal_holdout_preds.csv

    Returns
    -------
    rmse
    """
    # Loading Calibration Model.
    cal_bg_nbd = BetaGeoFitter(penalizer_coef=0.0)
    cal_bg_nbd.load_model(path="models/calibration_model.pkl")

    # Loading summary_cal_holdout dataset.
    summary_cal_holdout = pd.read_csv("datasets/summary_cal_holdout.csv")
    frequency_holdout = summary_cal_holdout["frequency_holdout"].copy()

    # Predictions.
    frequency_predictions = cal_bg_nbd.predict(
        t=time_units,
        frequency=summary_cal_holdout["frequency_cal"],
        recency=summary_cal_holdout["recency_cal"],
        T=summary_cal_holdout["T_cal"])

    # Adding Predictions to Summary dataset.
    summary_cal_holdout["frequency_predictions"] = frequency_predictions.copy()
    file_path = Path.cwd() / "datasets/summary_cal_holdout_preds.csv"
    summary_cal_holdout.to_csv(file_path, index=False)

    rmse = mean_squared_error(frequency_holdout, frequency_predictions,
                              squared=False)
    return rmse
def number_of_purchases(historical_rfm_data, time_units=30):
    """
    Predicted Conditional Expected Number of Purchases.

    Parameters
    ----------
    historical_rfm_data:
        Historical Frequency, Recency & T of an individual
    time_units: int, default=30.
        Number of days for predictions.

    Returns
    -------
    expected number of purchases.
    """
    clv_model = BetaGeoFitter(penalizer_coef=0.0)
    clv_model.load_model(path="models/customer_lifetime_estimator.pkl")

    frequency_predictions = clv_model.predict(
        t=time_units,
        frequency=historical_rfm_data["frequency"],
        recency=historical_rfm_data["recency"],
        T=historical_rfm_data["T"])

    return frequency_predictions
def estimate_clv_model(summary, model_penalizer=None):
    # set default values if they are not stated
    if model_penalizer is None:
        model_penalizer = 0

    # Building the model using BG/NBD
    bgf = BetaGeoFitter(penalizer_coef=model_penalizer)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])

    # There cannot be non-positive values in the monetary_value or frequency vector
    summary_with_value_and_returns = summary[(summary['monetary_value'] > 0) &
                                             (summary['frequency'] > 0)]

    # Setting up the Gamma-Gamma model
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(summary_with_value_and_returns['frequency'],
            summary_with_value_and_returns['monetary_value'])

    # Refit the BG/NBD model on the filtered data if none of frequency,
    # recency or T is a zero-length vector. (The original check wrapped the
    # generator in plain parentheses, which is always truthy; any() is
    # required for the test to mean anything.)
    if not any(len(x) == 0 for x in [summary_with_value_and_returns['recency'],
                                     summary_with_value_and_returns['frequency'],
                                     summary_with_value_and_returns['T']]):
        bgf.fit(summary_with_value_and_returns['frequency'],
                summary_with_value_and_returns['recency'],
                summary_with_value_and_returns['T'])

    return [bgf, ggf]
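# --- Usage sketch (not from the original source) ---
# Runs estimate_clv_model() on the bundled CDNOW summary data that includes
# monetary value, the canonical Gamma-Gamma example dataset in lifetimes.
from lifetimes.datasets import load_cdnow_summary_data_with_monetary_value

cdnow_mv = load_cdnow_summary_data_with_monetary_value()
bgf_fitted, ggf_fitted = estimate_clv_model(cdnow_mv)
print(bgf_fitted.summary)
print(ggf_fitted.summary)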
def _calibration_model():
    """
    Trains BG/NBD Calibration Model.

    Yields
    ------
    calibration_model.pkl
    """
    summary_cal_holdout = pd.read_csv("datasets/summary_cal_holdout.csv")

    # Training Calibration Model.
    cal_bg_nbd = BetaGeoFitter(penalizer_coef=0.0)
    cal_bg_nbd.fit(frequency=summary_cal_holdout["frequency_cal"],
                   recency=summary_cal_holdout["recency_cal"],
                   T=summary_cal_holdout["T_cal"],
                   verbose=True)

    # Saving Model.
    file_path = Path.cwd() / "models/calibration_model.pkl"
    cal_bg_nbd.save_model(path=file_path)
    return
def _clv_model():
    """
    Trains BG/NBD Model on entire RFM data, final fit.

    Yields
    ------
    customer_lifetime_estimator.pkl
    """
    summary = pd.read_csv("datasets/summary.csv")

    # Training the final model on the full summary data.
    clv = BetaGeoFitter(penalizer_coef=0.0)
    clv.fit(frequency=summary["frequency"],
            recency=summary["recency"],
            T=summary["T"],
            verbose=True)

    # Saving Model.
    file_path = Path.cwd() / "models/customer_lifetime_estimator.pkl"
    clv.save_model(path=file_path)
    return
customer’s purchases divided by the total number of purchases. Note that the
denominator here is different than the frequency described above.
"""
data = summary_data_from_transaction_data(
    df,
    customer_id,
    date_col,
    monetary_value_col='Sales',
)
# observation_period_end='2011-12-9')
# The default observation period end is the date of the last transaction.

### Basic Frequency/Recency analysis using the BG/NBD model ###
"""
BG/NBD is an attractive alternative to the Pareto/NBD model: it is cheaper to
compute and yields similar results.
"""
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(data['frequency'], data['recency'], data['T'])
print(bgf)
# For small sample sizes, the parameters can get implausibly large, so by
# adding an L2 penalty to the likelihood we can control how large they grow.
# This is implemented by setting a positive penalizer_coef when initializing
# the model. In typical applications, penalizers on the order of 0.001 to 0.1
# are effective.

# Model fit
plot_period_transactions(bgf)

# Calibration
summary_cal_holdout = calibration_and_holdout_data(
    df, customer_id, date_col,
    calibration_period_end='2011-06-08',
    observation_period_end='2011-12-9')
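# --- Illustration of the penalizer advice above (not from the original source) ---
# Refitting with increasing L2 penalties shrinks the fitted parameter
# magnitudes; the CDNOW summary data bundled with lifetimes is assumed here.
from lifetimes.datasets import load_cdnow_summary

cdnow_summary = load_cdnow_summary(index_col=[0])
for coef in (0.0, 0.001, 0.1):
    m = BetaGeoFitter(penalizer_coef=coef)
    m.fit(cdnow_summary['frequency'], cdnow_summary['recency'],
          cdnow_summary['T'])
    print(coef, dict(m.params_))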
def test_everything(X_train, y_train, X_test, y_test):
    '''
    1) test whether the full AdaBoost model performs better than BG/NBD
    2) test whether an AdaBoost model trained on the same variables performs better
    3) test AdaBoost split into 8 RFM groups vs AdaBoost at once vs AdaBoost at RFM
    4) alternative test/train split
    '''
    #####################
    ##  FULL ADABOOST  ##
    #####################
    print_annotation('FULL ADABOOST')

    ada = AdaBoostClassifier()
    n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=4)]
    learning_rate = [x for x in np.linspace(start=0.1, stop=1, num=4)]
    random_grid = {'n_estimators': n_estimators,
                   'learning_rate': learning_rate}

    clf = GridSearchCV(ada, random_grid, verbose=False, n_jobs=3,
                       scoring='f1').fit(X_train, y_train)
    # print(clf.best_params_)
    y_pred_full_ada = clf.predict(X_test)
    print(confusion_matrix(y_test, y_pred_full_ada))
    print(classification_report(y_test, y_pred_full_ada))

    ########################
    ##  PARTIAL ADABOOST  ##
    ########################
    print_annotation('PARTIAL ADABOOST')

    ada = AdaBoostClassifier()
    n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=4)]
    learning_rate = [x for x in np.linspace(start=0.1, stop=1, num=4)]
    random_grid = {'n_estimators': n_estimators,
                   'learning_rate': learning_rate}

    clf = GridSearchCV(ada, random_grid, verbose=False, n_jobs=3, scoring='f1') \
        .fit(X_train[['txn_total', 'recency_true', 'T']], y_train)
    y_pred_part_ada = clf.predict(X_test[['txn_total', 'recency_true', 'T']])
    print(confusion_matrix(y_test, y_pred_part_ada))
    print(classification_report(y_test, y_pred_part_ada))

    ##################
    ###   BG/NBD   ###
    ##################
    print_annotation('BG/NBD')

    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(X_train['txn_total'], X_train['recency_true'] / 7,
            X_train['T'] / 7)

    t = 52
    y_pred_bgnbd = bgf \
        .conditional_expected_number_of_purchases_up_to_time(
            t, X_test['txn_total'], X_test['recency_true'] / 7,
            X_test['T'] / 7
        )

    for threshold in np.linspace(0.7, 1.8, 4):
        threshold = round(threshold, 2)
        print('_' * 25)
        print(f"BG/NBD threshold: {threshold}")
        y_pred_bgnbd_tf = y_pred_bgnbd < threshold
        print('churn rate: ' +
              str(sum(y_pred_bgnbd_tf) / len(y_pred_bgnbd_tf)))
        print(confusion_matrix(y_test, y_pred_bgnbd_tf))
        print(classification_report(y_test, y_pred_bgnbd_tf))

    #############################
    ###   ALTERNATIVE SPLIT   ###
    #############################
    print('_' * 25)
    print('_,-*-,' * 4)
    print('_' * 25)
    print_annotation('FULL ADABOOST alt split')

    X_train_alt, X_test_alt, y_train_alt, y_test_alt = \
        train_test_split(X_test, y_test, test_size=0.33, random_state=42)

    ada = AdaBoostClassifier()
    n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=4)]
    learning_rate = [x for x in np.linspace(start=0.1, stop=1, num=4)]
    random_grid = {'n_estimators': n_estimators,
                   'learning_rate': learning_rate}

    clf = GridSearchCV(ada, random_grid, verbose=False, n_jobs=3,
                       scoring='f1').fit(X_train_alt, y_train_alt)
    # print(clf.best_params_)
    y_pred_ada_alt = clf.predict(X_test_alt)
    print(confusion_matrix(y_test_alt, y_pred_ada_alt))
    print(classification_report(y_test_alt, y_pred_ada_alt))

    ######################################
    print_annotation('PARTIAL ADABOOST alt split')

    ada = AdaBoostClassifier()
    n_estimators = [int(x) for x in np.linspace(start=100, stop=1000, num=4)]
    learning_rate = [x for x in np.linspace(start=0.1, stop=1, num=4)]
    random_grid = {'n_estimators': n_estimators,
                   'learning_rate': learning_rate}

    clf = GridSearchCV(ada, random_grid, verbose=False, n_jobs=3, scoring='f1') \
        .fit(X_train_alt[['txn_total', 'recency_true', 'T']], y_train_alt)
    y_pred_part_ada_alt = clf.predict(
        X_test_alt[['txn_total', 'recency_true', 'T']])
    print(confusion_matrix(y_test_alt, y_pred_part_ada_alt))
    print(classification_report(y_test_alt, y_pred_part_ada_alt))

    ######################################
    print_annotation('BG/NBD alt split')

    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(X_train_alt['txn_total'],
            X_train_alt['recency_true'] / 7,
            X_train_alt['T'] / 7)

    t = 52
    y_pred_bgnbd_ALT = bgf \
        .conditional_expected_number_of_purchases_up_to_time(
            t,
            X_test_alt['txn_total'],
            X_test_alt['recency_true'] / 7,
            X_test_alt['T'] / 7
        )

    for threshold in np.linspace(0.2, 2.5, 6):
        print('_' * 25)
        print(f"BG/NBD threshold: {threshold}")
        y_pred_bgnbd_tf_alt = y_pred_bgnbd_ALT < threshold
        print('churn rate: ' +
              str(sum(y_pred_bgnbd_tf_alt) / len(y_pred_bgnbd_tf_alt)))
        print(confusion_matrix(y_test_alt, y_pred_bgnbd_tf_alt))
        print(classification_report(y_test_alt, y_pred_bgnbd_tf_alt))
yr_pred = clf.predict(X_test)
print(confusion_matrix(y_test, yr_pred))
print(classification_report(y_test, yr_pred))

X_test['churn'] = y_test2
X_test['pred_8m'] = y_pred
X_test['pred_1m'] = yr_pred

#%%
X_test.to_csv('matrix.csv')

#%%
from lifetimes import BetaGeoFitter

# similar API to scikit-learn and lifelines.
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(X_train['txn_total'], X_train['recency_true'] / 7, X_train['T'] / 7)
print(bgf)

%matplotlib inline
from lifetimes.plotting import plot_frequency_recency_matrix

plot_frequency_recency_matrix(bgf)

#%%
from lifetimes.plotting import plot_probability_alive_matrix

f = plot_probability_alive_matrix(bgf)

t = 52
# removal of test records and negative values
df.drop(df[df["RECENCY"] > df["T"]].index, inplace=True)
# df.drop(df[df["MONETARY_VALUE"] <= 10.00].index, inplace=True)

# ==========================================================================
# Data check
# ==========================================================================

# Order distribution by frequency
df["FREQUENCY"].plot(kind="hist", bins=50)

# ==========================================================================
# BG/NBD model
# ==========================================================================
bgf = BetaGeoFitter(penalizer_coef=0.01)
bgf.fit(df["FREQUENCY"], df["RECENCY"], df["T"])
bgf.summary

plotting.plot_frequency_recency_matrix(bgf)
plotting.plot_probability_alive_matrix(bgf)

# Repeat transaction model check
plotting.plot_period_transactions(bgf)

# ==========================================================================
# Ranking reps from best to worst
# ==========================================================================
t = 1
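# --- Plausible continuation (not from the original source) ---
# The snippet ends right after setting t = 1; given the section header above,
# a natural next step is ranking by expected purchases in the next period.
df['predicted_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t, df['FREQUENCY'], df['RECENCY'], df['T'])
print(df.sort_values(by='predicted_purchases', ascending=False).head())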
def load_data_and_model():
    """Loads the Customer Lifetime Estimator model and the holdout summary."""
    model = BetaGeoFitter(penalizer_coef=0.0)
    model.load_model("../models/calibration_model.pkl")
    summary_cal_holdout = pd.read_csv("../datasets/summary_cal_holdout.csv")
    return model, summary_cal_holdout
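# --- Usage sketch (not from the original source) ---
# predict() needs only the calibration columns already present in the holdout
# summary; assumes the model and dataset artifacts above exist on disk.
model, holdout = load_data_and_model()
predictions = model.predict(t=243,
                            frequency=holdout['frequency_cal'],
                            recency=holdout['recency_cal'],
                            T=holdout['T_cal'])
print(predictions.describe())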
import lifetimes
from lifetimes import BetaGeoFitter
from lifetimes.plotting import plot_frequency_recency_matrix
from lifetimes.plotting import plot_probability_alive_matrix
import pandas as pd

data = pd.read_csv('lifetimes')

bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(data['frequency'], data['recency'], data['T'])
print(bgf)  # the original used the Python 2 print statement

plot_frequency_recency_matrix(bgf)
# plot_probability_alive_matrix(bgf)
import os

import pandas as pd
import pytest
import matplotlib

matplotlib.use("AGG")  # use a non-interactive backend
from matplotlib import pyplot as plt

from lifetimes import plotting
from lifetimes import BetaGeoFitter, ParetoNBDFitter, ModifiedBetaGeoFitter
from lifetimes.datasets import load_cdnow, load_transaction_data
from lifetimes import utils

bgf = BetaGeoFitter()
cd_data = load_cdnow()
bgf.fit(cd_data["frequency"], cd_data["recency"], cd_data["T"],
        iterative_fitting=0)


@pytest.mark.plottest
class TestPlotting:

    @pytest.mark.mpl_image_compare(tolerance=30)
    def test_plot_period_transactions(self):
        plt.figure()
        plotting.plot_period_transactions(bgf)
        return plt.gcf()

    @pytest.mark.mpl_image_compare(tolerance=30)
    def test_plot_period_transactions_parento(self):
        pnbd = ParetoNBDFitter()
        pnbd.fit(cd_data["frequency"], cd_data["recency"], cd_data["T"],
                 iterative_fitting=0)
class CLV(object):
    """
    INPUT
        pmg_num (int) the product market group number, default = 1
        outfile1 (str) the filename indicating where to store the raw data
            before analysis, default = '../data/clvtrainingset01.csv'
        outfile2 (str) the filename containing the results,
            default = '../data/clv01.csv'
        date_range (list) the start date and end date of the years to analyze,
            default = ['2008-09-01','2016-09-01']

    attributes other than those listed above
        self.data (DataFrame) a pandas DataFrame object of the data to be used
            for analysis
        self.bgf (from lifetimes) a statistical model object from the
            lifetimes package
        self.ggf (from lifetimes) a statistical model object from the
            lifetimes package
        self.results (DataFrame) a pandas DataFrame object of the results of
            analysis
    """

    def __init__(self, pmg_num=1, outfile1='../data/clvtrainingset01.csv',
                 outfile2='../data/clv01.csv',
                 date_range=['2008-09-01', '2016-09-01']):
        self.pmg_num = pmg_num
        # outfile1 stores a clean version of the raw data used for analysis;
        # this is important for reproducibility
        self.outfile1 = outfile1
        # outfile2 stores the clv estimation results
        self.outfile2 = outfile2
        self.date_range = date_range
        self.data = None
        self.bgf = None
        self.ggf = None
        self.results = None

    def get_data_from_server(self, cmd=None):
        """
        Gets data from sales_db and stores the query results in self.data

        INPUT
            cmd (str) the default sql query is below

        The default query has been replaced. The original query was an 8 line
        select command.
        """
        # server name
        dsn = "THE SERVER NAME"
        cnxn_name = "DSN=%s" % dsn
        connection = odbc.connect(cnxn_name)  # use to access the database
        c = connection.cursor()  # generate cursor object

        # Grab transaction data from Postgres
        if not cmd:
            cmd = """SQL DEFAULT COMMAND GOES HERE""" % (self.pmg_num,
                                                         self.date_range[0],
                                                         self.date_range[1])
        c.execute(cmd)  # execute the sql command

        # list to store the query data
        transaction_data = []

        # create a dictionary to convert customer ids to names
        to_name = dict(np.genfromtxt('../data/names.csv', dtype=str,
                                     delimiter='\t'))

        for row in c:
            # pull data from each row of the query data
            cust, rsv_date, sales = row
            cust_id = str(int(cust))
            name = to_name[cust_id]
            # check to see if customer is inactive
            if use(name):
                rsv_date1_readable = rsv_date.strftime('%Y-%m-%d')  # date formatting
                sales_float = float(sales)  # the transaction amount
                # add dictionary of data to list
                transaction_data.append({"id": cust, "date": rsv_date,
                                         "sales": sales_float})

        # convert to dataframe
        df = pd.DataFrame(transaction_data, columns=['id', 'date', 'sales'])

        # store results
        df.to_csv(self.outfile1, index=False)

        # IMPORTANT: use correct observation_period_end date
        self.data = summary_data_from_transaction_data(
            df, 'id', 'date', 'sales',
            observation_period_end=self.date_range[1], freq='M')

    def get_data_from_file(self, filename, **kwargs):
        df = pd.read_csv(filename, **kwargs)
        self.data = summary_data_from_transaction_data(
            df, 'id', 'date', 'sales',
            observation_period_end=self.date_range[1], freq='M')

    def fit(self, months=96):
        """
        Computes CLV estimates for the next n months and stores results in
        self.results

        INPUT
            months (int) number of months to predict, default = 96 (8 years)
        """
        ### PREDICT NUMBER OF PURCHASES
        self.bgf = BetaGeoFitter()  # see lifetimes documentation for details
        self.bgf.fit(self.data['frequency'], self.data['recency'],
                     self.data['T'])

        # 8 years = 96 months
        self.data['predicted_purchases'] = \
            self.bgf.conditional_expected_number_of_purchases_up_to_time(
                months,
                self.data['frequency'],
                self.data['recency'],
                self.data['T'])

        ### PREDICT FUTURE PURCHASE AMOUNT
        self.ggf = GammaGammaFitter(penalizer_coef=0)
        self.ggf.fit(self.data['frequency'], self.data['monetary_value'])

        # predict next transaction
        self.data['predicted_trans_profit'] = self.ggf.conditional_expected_average_profit(
            frequency=self.data['frequency'],
            monetary_value=self.data['monetary_value'])

        ### ESTIMATE CLV
        self.data['clv_estimation'] = (self.data['predicted_trans_profit'] *
                                       self.data['predicted_purchases'])
        self.data['prob_alive'] = self.bgf.conditional_probability_alive(
            self.data['frequency'], self.data['recency'], self.data['T'])
        self.results = self.data.sort_values(by='clv_estimation',
                                             ascending=False)

        # store results
        self.results.to_csv(self.outfile2, index=False)

    def plot_matrices(self):
        """
        Plots three matrices:

            probability alive matrix: displays the probability that a
                customer is active
            frequency recency matrix: displays frequency and recency with
                color corresponding to monetary value
            period transactions: displays predicted and actual transaction
                values over time

        (check the documentation in lifetimes for more details)
        """
        plot_probability_alive_matrix(self.bgf, cmap='viridis')
        plot_frequency_recency_matrix(self.bgf, cmap='viridis')
        plot_period_transactions(self.bgf)
def generate_clv_table(data, clv_prediction_time=None, model_penalizer=None):

    # set default values if they are not stated
    if clv_prediction_time is None:
        clv_prediction_time = 12
    if model_penalizer is None:
        model_penalizer = 0

    # Reformat csv as a Pandas dataframe
    # data = pd.read_csv(csv_file)

    # Remove non-search sessions
    data = data[data['Searches'] > 0]

    max_date = data['activity_date'].max()

    # Use "summary_data_from_transaction_data" to aggregate the activity
    # stream into the appropriate metrics. The model requires an
    # 'activity_date' column; for our purpose this is synonymous with
    # submission_date.
    summary = summary_data_from_transaction_data(
        data, 'client_id', 'activity_date', 'Revenue',
        observation_period_end=max_date)

    # Building the model using BG/NBD
    bgf = BetaGeoFitter(penalizer_coef=model_penalizer)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])

    # Conditional expected purchases: the purchases expected from each
    # individual over the next t days
    t = 14
    summary['predicted_searches'] = bgf.conditional_expected_number_of_purchases_up_to_time(
        t, summary['frequency'], summary['recency'], summary['T'])

    # Conditional alive probability
    summary['alive_prob'] = summary.apply(
        lambda row: calc_alive_prob(row, bgf), axis=1)
    summary['alive_prob'] = summary['alive_prob'].astype(float)

    # There cannot be non-positive values in the monetary_value or frequency vector
    summary_with_value_and_returns = summary[(summary['monetary_value'] > 0) &
                                             (summary['frequency'] > 0)]

    # There cannot be zero-length vectors in any of frequency, recency or T
    if any(len(x) == 0 for x in [summary_with_value_and_returns['recency'],
                                 summary_with_value_and_returns['frequency'],
                                 summary_with_value_and_returns['T']]):
        logger.debug(data['client_id'])

    # Setting up the Gamma-Gamma model
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(summary_with_value_and_returns['frequency'],
            summary_with_value_and_returns['monetary_value'])

    # Output average profit per transaction by client ID
    ggf_output = ggf.conditional_expected_average_profit(
        summary_with_value_and_returns['frequency'],
        summary_with_value_and_returns['monetary_value'])

    # Refit the BG/NBD model on the filtered data if none of frequency,
    # recency or T is a zero-length vector. (The original check wrapped the
    # generator in plain parentheses, which is always truthy; any() is
    # required for the test to mean anything.)
    if not any(len(x) == 0 for x in [summary_with_value_and_returns['recency'],
                                     summary_with_value_and_returns['frequency'],
                                     summary_with_value_and_returns['T']]):
        bgf.fit(summary_with_value_and_returns['frequency'],
                summary_with_value_and_returns['recency'],
                summary_with_value_and_returns['T'])

    # Getting customer lifetime value using the Gamma-Gamma output
    # NOTE: the time can be adjusted, but is currently set to 12 months
    customer_predicted_value = ggf.customer_lifetime_value(
        bgf,  # the model used to predict the number of future transactions
        summary_with_value_and_returns['frequency'],
        summary_with_value_and_returns['recency'],
        summary_with_value_and_returns['T'],
        summary_with_value_and_returns['monetary_value'],
        time=clv_prediction_time,  # months
        discount_rate=0.01  # monthly discount rate ~ 12.7% annually
    )

    # Converting to dataframe
    df_cpv = pd.DataFrame({
        'client_id': customer_predicted_value.index,
        'pred_values': customer_predicted_value.values
    })

    # Setting client_id as index
    df_cpv = df_cpv.set_index('client_id')

    # Merge with original summary
    df_merged = pd.merge(summary, df_cpv, left_index=True, right_index=True,
                         how='outer')

    # Historical CLV
    data_hist = data.groupby(['client_id'])[['Searches', 'Revenue']].apply(
        lambda x: x.astype(float).sum())

    # Merge with original summary
    df_final = pd.merge(df_merged, data_hist, left_index=True,
                        right_index=True, how='outer')

    # Prevent NaN in the pred_values column (use .loc rather than the
    # original chained assignment)
    df_final.loc[df_final.frequency == 0, 'pred_values'] = 0.0

    # Create column that combines historical and predicted customer value
    df_final['total_clv'] = df_final['pred_values'] + df_final['Revenue']

    # Create column with the number of days since the customer was last active
    df_final['last_active'] = df_final['T'] - df_final['recency']

    # Label users inactive over 14 days as "Expired", ELSE "Active"
    df_final['user_status'] = np.where(df_final['last_active'] > 14,
                                       'Expired', 'Active')

    # Add column with date of calculation
    # Set calc_date to max submission date
    df_final['calc_date'] = max_date.date()  # pd.Timestamp('today').date()

    # Rename columns as appropriate
    df_final.columns = [
        'frequency', 'recency', 'customer_age', 'avg_session_value',
        'predicted_searches_14_days', 'alive_probability',
        'predicted_clv_12_months', 'historical_searches', 'historical_clv',
        'total_clv', 'days_since_last_active', 'user_status', 'calc_date'
    ]

    # Prevent non-returning customers from having 100% alive probability
    df_final.loc[df_final.frequency == 0, 'alive_probability'] = 0.0

    return df_final
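# --- Usage sketch (not from the original source) ---
# generate_clv_table() expects an activity stream with client_id,
# activity_date, Searches and Revenue columns; the filename here is
# hypothetical.
activity = pd.read_csv('activity_stream.csv', parse_dates=['activity_date'])
clv_table = generate_clv_table(activity, clv_prediction_time=12,
                               model_penalizer=0.01)
print(clv_table.head())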
recency.rename(columns={"step": "recency"}, inplace=True)
frequency.rename(columns={"step": "frequency"}, inplace=True)
T.rename(columns={"step": "T"}, inplace=True)
monetary.rename(columns={"amount": "monetary_value"}, inplace=True)

df_rfm = pd.concat([recency, T, monetary, frequency], axis=1)

ggf = GammaGammaFitter(penalizer_coef=0)
ggf.fit(frequency=df_rfm["frequency"],
        monetary_value=df_rfm["monetary_value"])
df_rfm["expected_monetary_value"] = df_rfm.apply(
    lambda row: ggf.conditional_expected_average_profit(
        row["frequency"], row["monetary_value"]),
    axis=1)

bgf = BetaGeoFitter(penalizer_coef=1)
bgf.fit(frequency=df_rfm["frequency"],
        recency=df_rfm["recency"],
        T=df_rfm["T"])
df_rfm["pred_nb_purchases"] = bgf.conditional_expected_number_of_purchases_up_to_time(
    t=180,
    frequency=df_rfm["frequency"],
    recency=df_rfm["recency"],
    T=df_rfm["T"])

# predicted revenue = predicted number of purchases x expected value per purchase
df_rfm["pred_revenue"] = df_rfm.apply(
    lambda row: row["pred_nb_purchases"] * row["expected_monetary_value"],
    axis=1)
import os

import pandas as pd
import pytest
import matplotlib

matplotlib.use('AGG')  # use a non-interactive backend
from matplotlib import pyplot as plt

from lifetimes import plotting
from lifetimes import BetaGeoFitter, ParetoNBDFitter, ModifiedBetaGeoFitter
from lifetimes.datasets import load_cdnow, load_transaction_data
from lifetimes import utils

bgf = BetaGeoFitter()
cd_data = load_cdnow()
bgf.fit(cd_data['frequency'], cd_data['recency'], cd_data['T'],
        iterative_fitting=1)


@pytest.mark.plottest
class TestPlotting():

    @pytest.mark.mpl_image_compare(tolerance=30)
    def test_plot_period_transactions(self):
        plt.figure()
        plotting.plot_period_transactions(bgf)
        return plt.gcf()

    @pytest.mark.mpl_image_compare(tolerance=30)
    def test_plot_period_transactions_parento(self):
        pnbd = ParetoNBDFitter()
        pnbd.fit(cd_data['frequency'], cd_data['recency'], cd_data['T'],
                 iterative_fitting=1)