def clv(pareto, mbg, summary):
    """Predict customer lifetime value with two transaction models.

    A Gamma-Gamma spend model is fitted on the rows of ``summary`` whose
    ``frequency`` is greater than zero. It is then combined, in turn, with
    each of the supplied transaction-prediction models over every row of
    ``summary``.

    Parameters
    ----------
    pareto, mbg
        Fitted models, each passed as ``transaction_prediction_model`` to
        ``GammaGammaFitter.customer_lifetime_value``.
    summary
        Frame with ``frequency``, ``recency``, ``T`` and ``monetary_value``
        columns.

    Returns
    -------
    tuple
        ``(pred_clv_pareto, pred_clv_mbg)`` in that order.
    """
    repeat_buyers = summary[summary['frequency'] > 0]

    spend_model = GammaGammaFitter(penalizer_coef=0.0)
    spend_model.fit(
        frequency=repeat_buyers['frequency'],
        monetary_value=repeat_buyers['monetary_value'],
    )

    def lifetime_value(transaction_model):
        # Same inputs and horizon for both models; only the model differs.
        return spend_model.customer_lifetime_value(
            transaction_prediction_model=transaction_model,
            frequency=summary['frequency'],
            recency=summary['recency'],
            T=summary['T'],
            monetary_value=summary['monetary_value'],
            time=12,
            freq="D",
        )

    pred_clv_pareto = lifetime_value(pareto)
    pred_clv_mbg = lifetime_value(mbg)
    return pred_clv_pareto, pred_clv_mbg
def readGammaGammaFitterModel():
    """Load and return the Gamma-Gamma model saved as ``GammaGammaFitterModel.pkl``.

    The path is relative, so it is resolved against the current working
    directory.
    """
    model_path = "GammaGammaFitterModel.pkl"
    loaded_model = GammaGammaFitter()
    loaded_model.load_model(model_path)
    return loaded_model
def create_cltv_pred(dataframe, w=4, m=1):
    """Fit BG/NBD and Gamma-Gamma models and append CLTV predictions.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Per-customer frame with ``frequency``, ``recency_weekly``,
        ``T_weekly`` and ``monetary_avg`` columns.
    w : int
        Horizon, in weeks, for the BG/NBD expected-sales prediction. The
        result is stored in ``exp_sales_{w}_week``.
    m : int
        Horizon, in months, passed as ``time`` to
        ``customer_lifetime_value``. The result is stored in
        ``cltv_p_{m}_month``.

    Returns
    -------
    pandas.DataFrame
        Rows with ``monetary_avg > 0``, extended with the prediction
        columns, ``expected_average_profit``, ``cltv_p_score`` (scaled to
        1-100) and ``cltv_p_segment`` (terciles labelled C/B/A). Columns
        named ``recency``, ``frequency`` and ``monetary`` are dropped.
    """
    # Take an explicit copy of the filtered rows: the column assignments
    # below would otherwise target a slice of the caller's frame and trigger
    # pandas' SettingWithCopyWarning.
    dataframe = dataframe[dataframe["monetary_avg"] > 0].copy()
    dataframe["frequency"] = dataframe["frequency"].astype(int)

    # BG/NBD: expected number of transactions in the next `w` weeks.
    bgf = BetaGeoFitter(penalizer_coef=0.001)
    bgf.fit(dataframe['frequency'],
            dataframe['recency_weekly'],
            dataframe['T_weekly'])
    dataframe[f'exp_sales_{w}_week'] = bgf.predict(
        w,
        dataframe['frequency'],
        dataframe['recency_weekly'],
        dataframe['T_weekly'])

    # Gamma-Gamma: expected average profit per transaction.
    ggf = GammaGammaFitter(penalizer_coef=0.001)
    ggf.fit(dataframe['frequency'], dataframe['monetary_avg'])
    dataframe["expected_average_profit"] = (
        ggf.conditional_expected_average_profit(dataframe['frequency'],
                                                dataframe['monetary_avg']))

    # CLTV prediction; recency and T are expressed in weeks, hence freq="W".
    cltv = ggf.customer_lifetime_value(bgf,
                                       dataframe['frequency'],
                                       dataframe['recency_weekly'],
                                       dataframe['T_weekly'],
                                       dataframe['monetary_avg'],
                                       time=m,
                                       freq="W",
                                       discount_rate=0.01)
    dataframe[f'cltv_p_{m}_month'] = cltv

    # Rescale the CLTV to a 1-100 score, then cut it into three segments.
    scaler = MinMaxScaler(feature_range=(1, 100))
    dataframe['cltv_p_score'] = scaler.fit_transform(
        dataframe[[f'cltv_p_{m}_month']])
    dataframe['cltv_p_segment'] = pd.qcut(dataframe['cltv_p_score'],
                                          3,
                                          labels=['C', 'B', 'A'])

    # Drop the raw RFM inputs from the result.
    new_col = dataframe.columns[
        ~dataframe.columns.isin(['recency', 'frequency', 'monetary'])]
    dataframe = dataframe[new_col]
    return dataframe
def run_btyd(model_type, data_src, threshold_date, predict_end):
    """Run the selected BTYD model on the data files located in ``data_src``.

    The predicted lifetime values are written to ``OUTPUT_FILE`` inside
    ``data_src`` and the RMSE is logged and printed.

    Args:
      model_type: model type (``PARETO`` or ``BGNBD``)
      data_src: path to data
      threshold_date: end date for training data 'YYYY-mm-dd'
      predict_end: end date for predictions 'YYYY-mm-dd'

    Raises:
      ValueError: if either date does not match 'YYYY-mm-dd', or if
        ``model_type`` is neither ``PARETO`` nor ``BGNBD``.
    """
    train_end_date = datetime.strptime(threshold_date, '%Y-%m-%d')
    predict_end_date = datetime.strptime(predict_end, '%Y-%m-%d')

    # Fail fast on an unknown model type. Without this check no fitter is
    # assigned and the failure only surfaces later as an UnboundLocalError,
    # after the data has already been loaded.
    if model_type not in (PARETO, BGNBD):
        raise ValueError(
            'Unsupported model_type %r; expected %r or %r' %
            (model_type, PARETO, BGNBD))

    # load training transaction data
    summary, actual_df = load_data(data_src)

    # train fitter for selected model
    tf.logging.info('Fitting model...')
    if model_type == PARETO:
        fitter = paretonbd_model(summary)
    else:
        fitter = bgnbd_model(summary)
    tf.logging.info('Done.')

    #
    # use trained fitter to compute actual vs predicted ltv for each user
    #

    # Length of the prediction period in days, and in months rounded up
    # using 30-day months.
    time_days = (predict_end_date - train_end_date).days
    time_months = int(math.ceil(time_days / 30.0))

    # fit gamma-gamma model
    # NOTE(review): fitted on every row of `summary`; Gamma-Gamma models are
    # usually fitted on repeat customers only -- confirm load_data() already
    # filters accordingly.
    tf.logging.info('Fitting GammaGamma model...')
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(summary['frequency'], summary['monetary_value'])
    tf.logging.info('Done.')

    ltv, rmse = predict_value(summary, actual_df, fitter, ggf, time_days,
                              time_months)

    # output results to csv
    output_file = os.path.join(data_src, OUTPUT_FILE)
    ltv.to_csv(output_file, index=False)

    # log results
    tf.logging.info('BTYD RMSE error for %s model: %.2f', model_type, rmse)
    print('RMSE prediction error: %.2f' % rmse)
def predictSpending(customerId):
    """Predict a customer's expected average transaction amount.

    Transactions are read from ``sample_transactions.csv`` and summarised
    per customer up to today. The saved Gamma-Gamma model ``ggf.pkl`` is
    then applied to the requested customer's summary row.

    Parameters
    ----------
    customerId
        Identifier to look up in the per-customer summary index.
        NOTE(review): the lookup is an exact index match, so the type must
        agree with the ``customerId`` column as parsed from the CSV -- confirm
        against the caller.

    Returns
    -------
    A Flask JSON response. On success it is
    ``{"success": True, "result": {"customerId": ..., "y": <prediction>}}``.
    When ``customerId`` is empty, or the customer is unknown or has no
    repeat purchase, the default payload with ``"success": False`` is
    returned.
    """
    # Default failure payload; replaced only once a prediction exists.
    data = {"success": False, "result": {"customerId": "", "y": 0.0}}

    # ensure the customer ID was properly uploaded to our endpoint
    if customerId:
        print("* get data")
        # Kept in its own variable so the response payload above is not
        # overwritten by the transactions frame.
        transactions = pandas.read_csv("sample_transactions.csv")

        print("* prepare data")
        # Summary columns, one row per customer:
        #   frequency      : number of repeat purchase transactions
        #   recency        : days between first and latest purchase
        #   T              : days between first purchase and end of the
        #                    period under study
        #   monetary_value : average transaction amount
        today = pandas.to_datetime(datetime.date.today())
        summaryData = summary_data_from_transaction_data(
            transactions,
            "customerId",
            "transactionDate",
            monetary_value_col="transactionAmount",
            observation_period_end=today)

        # Keep only customers with at least one repeat transaction.
        analysisData = summaryData[summaryData["frequency"] > 0]

        try:
            customer = analysisData.loc[customerId]
        except KeyError:
            # Unknown customer, or one without repeat purchases: report
            # failure instead of letting the request crash.
            return flask.jsonify(data)

        # load model
        ggf_loaded = GammaGammaFitter()
        ggf_loaded.load_model('ggf.pkl')

        # estimate the average transaction amount
        predict = ggf_loaded.conditional_expected_average_profit(
            customer["frequency"], customer['monetary_value'])

        # add the input and predicted output to the return data
        data = {
            "success": True,
            "result": {
                "customerId": customerId,
                "y": predict
            }
        }

    # return the data dictionary as a JSON response
    return flask.jsonify(data)
def trainGammaGammaModel():
    """Fit a Gamma-Gamma model on repeat customers and persist it.

    The per-customer summary comes from
    ``readsummaryDataFromTransactionDataForCLV()``. Only rows with
    ``frequency > 0`` are used for fitting, and the fitted model is handed
    to ``saveGammaGammaFitterModel``.
    """
    customer_summary = readsummaryDataFromTransactionDataForCLV()

    # Restrict the training set to customers with a positive frequency.
    has_repeat_purchase = customer_summary["frequency"] > 0
    repeat_customers = customer_summary[has_repeat_purchase]

    spend_model = GammaGammaFitter(penalizer_coef=0.0)
    spend_model.fit(repeat_customers["frequency"],
                    repeat_customers["monetary_value"])

    saveGammaGammaFitterModel(spend_model)
def estimate_clv_model(summary, model_penalizer=None):
    """Fit the BG/NBD and Gamma-Gamma models used for CLV estimation.

    Parameters
    ----------
    summary
        Per-customer frame with ``frequency``, ``recency``, ``T`` and
        ``monetary_value`` columns.
    model_penalizer : float, optional
        Penalizer coefficient for the BG/NBD model; ``None`` means 0.

    Returns
    -------
    list
        ``[bgf, ggf]``. The Gamma-Gamma model is fitted on customers with
        positive ``monetary_value`` and ``frequency``. The BG/NBD model is
        refitted on that same subset whenever the subset is non-empty;
        otherwise it stays fitted on the full ``summary``.
    """
    # set default values if they are not stated
    if model_penalizer is None:
        model_penalizer = 0

    # Building the Model using BG/NBD
    bgf = BetaGeoFitter(penalizer_coef=model_penalizer)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])

    # There cannot be non-positive values in the monetary_value or
    # frequency vector
    summary_with_value_and_returns = summary[
        (summary['monetary_value'] > 0) & (summary['frequency'] > 0)]

    # Setting up Gamma Gamma model
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(summary_with_value_and_returns['frequency'],
            summary_with_value_and_returns['monetary_value'])

    # Refit the BG/NBD model on the same data as the Gamma-Gamma model,
    # unless that subset is empty. An explicit emptiness test is required:
    # negating a generator expression is always False, which would skip the
    # refit unconditionally.
    if not summary_with_value_and_returns.empty:
        bgf.fit(summary_with_value_and_returns['frequency'],
                summary_with_value_and_returns['recency'],
                summary_with_value_and_returns['T'])

    return [bgf, ggf]
def gg_model(rfmmod, bgf, p, f):
    """Fit a Gamma-Gamma model and rank returning customers by predicted CLTV.

    Parameters
    ----------
    rfmmod
        Per-customer frame with ``frequency``, ``recency``, ``T`` and
        ``monetary_value`` columns. It is not modified.
    bgf
        Fitted transaction-prediction model passed to
        ``customer_lifetime_value``.
    p : float
        Penalizer coefficient for the Gamma-Gamma model.
    f : str
        Value passed as ``freq`` to ``customer_lifetime_value``.

    Returns
    -------
    DataFrame
        Customers with positive ``frequency`` and ``monetary_value``,
        extended with ``predicted_cltv`` and ``exp_profit``, sorted by
        ``predicted_cltv`` descending and rounded to 3 decimals.
    """
    # Copy the filtered rows so the new columns are added to an independent
    # frame rather than to a slice of `rfmmod` (avoids pandas'
    # SettingWithCopyWarning).
    returning = (rfmmod['frequency'] > 0) & (rfmmod['monetary_value'] > 0)
    ret_cust = rfmmod[returning].copy()

    # Build the model
    ggf = GammaGammaFitter(penalizer_coef=p)
    ggf.fit(ret_cust['frequency'], ret_cust['monetary_value'])

    pred_clt = ggf.customer_lifetime_value(
        bgf,
        ret_cust['frequency'],
        ret_cust['recency'],
        ret_cust['T'],
        ret_cust['monetary_value'],
        time=12,  # months
        freq=f,
        discount_rate=0.01)

    ret_cust['predicted_cltv'] = pred_clt
    ret_cust['exp_profit'] = ggf.conditional_expected_average_profit(
        ret_cust['frequency'], ret_cust['monetary_value'])

    ret_cust = ret_cust.sort_values('predicted_cltv',
                                    ascending=False).round(3)
    return ret_cust
def create_cltv_p(dataframe, today_date=None):
    """Build a per-customer 6-month CLTV prediction table.

    Parameters
    ----------
    dataframe
        Transaction lines with ``Customer ID``, ``InvoiceDate``, ``Invoice``
        and ``TotalPrice`` columns.
    today_date : datetime, optional
        Analysis date used to compute the customer tenure ``T``. Defaults to
        2011-12-11.

    Returns
    -------
    DataFrame
        Indexed by customer, with columns ``recency_cltv_p``, ``T``,
        ``monetary_avg``, ``recency_weekly_cltv_p``, ``T_weekly``,
        ``exp_sales_1_month``, ``exp_sales_3_month``,
        ``expected_average_profit``, ``cltv_p`` (scaled to 1-100) and
        ``cltv_p_segment`` (terciles labelled C/B/A). Only customers with a
        positive average spend and more than one invoice are kept.
    """
    if today_date is None:
        today_date = dt.datetime(2011, 12, 11)

    # Build the RFM metrics plus tenure, one row per customer.
    # NOTE(review): `frequency` here is the raw count of distinct invoices;
    # lifetimes defines frequency as repeat purchases -- confirm this is
    # intended.
    rfm = dataframe.groupby('Customer ID').agg({
        'InvoiceDate': [
            lambda date: (date.max() - date.min()).days,
            lambda date: (today_date - date.min()).days,
        ],
        'Invoice': lambda num: num.nunique(),
        'TotalPrice': lambda TotalPrice: TotalPrice.sum(),
    })
    # Assigning flat names replaces the two-level header produced by agg(),
    # so no separate droplevel() step is needed.
    rfm.columns = ['recency_cltv_p', 'T', 'frequency', 'monetary']

    # Simplified monetary_avg: total spend divided by number of invoices.
    rfm["monetary"] = rfm["monetary"] / rfm["frequency"]
    rfm.rename(columns={"monetary": "monetary_avg"}, inplace=True)

    # Weekly recency and tenure for the BG/NBD model.
    rfm["recency_weekly_cltv_p"] = rfm["recency_cltv_p"] / 7
    rfm["T_weekly"] = rfm["T"] / 7

    # Sanity filters. The explicit copy keeps the column assignments below
    # from targeting a slice (avoids pandas' SettingWithCopyWarning).
    rfm = rfm[(rfm["monetary_avg"] > 0) & (rfm['frequency'] > 1)].copy()
    rfm["frequency"] = rfm["frequency"].astype(int)

    # BG/NBD
    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(rfm['frequency'], rfm['recency_weekly_cltv_p'], rfm['T_weekly'])

    # Expected sales over 1 month (4 weeks) and 3 months (12 weeks).
    rfm["exp_sales_1_month"] = bgf.predict(4,
                                           rfm['frequency'],
                                           rfm['recency_weekly_cltv_p'],
                                           rfm['T_weekly'])
    rfm["exp_sales_3_month"] = bgf.predict(12,
                                           rfm['frequency'],
                                           rfm['recency_weekly_cltv_p'],
                                           rfm['T_weekly'])

    # Gamma-Gamma: expected average profit per transaction.
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(rfm['frequency'], rfm['monetary_avg'])
    rfm["expected_average_profit"] = ggf.conditional_expected_average_profit(
        rfm['frequency'], rfm['monetary_avg'])

    # 6-month CLTV; recency and T are in weeks, hence freq="W".
    cltv = ggf.customer_lifetime_value(bgf,
                                       rfm['frequency'],
                                       rfm['recency_weekly_cltv_p'],
                                       rfm['T_weekly'],
                                       rfm['monetary_avg'],
                                       time=6,
                                       freq="W",
                                       discount_rate=0.01)
    rfm["cltv_p"] = cltv

    # Rescale the CLTV to the 1-100 range.
    scaler = MinMaxScaler(feature_range=(1, 100))
    scaler.fit(rfm[["cltv_p"]])
    rfm["cltv_p"] = scaler.transform(rfm[["cltv_p"]])

    # Tercile segments: C (lowest) to A (highest).
    rfm["cltv_p_segment"] = pd.qcut(rfm["cltv_p"], 3, labels=["C", "B", "A"])

    rfm = rfm[["recency_cltv_p", "T", "monetary_avg", "recency_weekly_cltv_p",
               "T_weekly", "exp_sales_1_month", "exp_sales_3_month",
               "expected_average_profit", "cltv_p", "cltv_p_segment"]]
    return rfm
def create_cltv_p(dataframe, today_date=None):
    """Compute expected sales, expected profit and 6-month CLTV per customer.

    Parameters
    ----------
    dataframe
        Transaction lines with ``Customer ID``, ``InvoiceDate``, ``Invoice``
        and ``TotalPrice`` columns.
    today_date : datetime, optional
        Analysis date from which the tenure ``T`` is measured. Defaults to
        2011-12-11.

    Returns
    -------
    DataFrame
        One row per retained customer (positive average spend and more than
        one invoice) with the recency/tenure inputs, ``exp_sales_1_month``,
        ``exp_sales_3_month``, ``expected_average_profit``, ``cltv_p``
        scaled to 1-100 and ``cltv_p_segment`` in C/B/A.
    """
    if today_date is None:
        today_date = dt.datetime(2011, 12, 11)

    rfm = dataframe.groupby('Customer ID').agg({
        'InvoiceDate': [
            lambda date: (date.max() - date.min()).days,  # recency, in days
            lambda date: (today_date - date.min()).days,  # tenure T, in days
        ],
        'Invoice': lambda num: num.nunique(),
        'TotalPrice': lambda price: price.sum(),
    })
    rfm.columns = rfm.columns.droplevel(0)
    rfm.columns = ['recency_cltv_p', 'T', 'frequency', 'monetary']

    # Average spend per invoice.
    rfm['monetary'] = rfm['monetary'] / rfm['frequency']
    rfm.rename(columns={'monetary': 'monetary_avg'}, inplace=True)

    # The models below work on weekly recency and tenure.
    rfm["recency_weekly_cltv_p"] = rfm['recency_cltv_p'] / 7
    rfm['T_weekly'] = rfm['T'] / 7

    # Filter, then copy: later column assignments must land on an
    # independent frame, not on a slice (SettingWithCopyWarning otherwise).
    keep = (rfm['monetary_avg'] > 0) & (rfm['frequency'] > 1)
    rfm = rfm.loc[keep].copy()
    rfm['frequency'] = rfm['frequency'].astype(int)

    # BG/NBD: expected sales over 4 and 12 weeks.
    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(rfm['frequency'], rfm['recency_weekly_cltv_p'], rfm['T_weekly'])
    rfm["exp_sales_1_month"] = bgf.predict(4, rfm['frequency'],
                                           rfm['recency_weekly_cltv_p'],
                                           rfm['T_weekly'])
    rfm["exp_sales_3_month"] = bgf.predict(12, rfm['frequency'],
                                           rfm['recency_weekly_cltv_p'],
                                           rfm['T_weekly'])

    # Gamma-Gamma: expected average profit and 6-month CLTV.
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(rfm['frequency'], rfm['monetary_avg'])
    rfm["expected_average_profit"] = ggf.conditional_expected_average_profit(
        rfm['frequency'], rfm['monetary_avg'])
    cltv = ggf.customer_lifetime_value(bgf,
                                       rfm['frequency'],
                                       rfm['recency_weekly_cltv_p'],
                                       rfm['T_weekly'],
                                       rfm['monetary_avg'],
                                       time=6,
                                       freq='W',
                                       discount_rate=0.01)
    rfm["cltv_p"] = cltv

    # Scale to 1-100 and split into terciles.
    scaler = MinMaxScaler(feature_range=(1, 100))
    scaler.fit(rfm[["cltv_p"]])
    rfm["cltv_p"] = scaler.transform(rfm[["cltv_p"]])
    rfm["cltv_p_segment"] = pd.qcut(rfm["cltv_p"], 3, labels=["C", "B", "A"])

    rfm = rfm[[
        "recency_cltv_p", "T", "monetary_avg", "recency_weekly_cltv_p",
        "T_weekly", "exp_sales_1_month", "exp_sales_3_month",
        "expected_average_profit", "cltv_p", "cltv_p_segment"
    ]]
    return rfm
def get_clv(oracle_conn_id, src_client_id, storage_bucket, ds, **context):
    """Compute customer lifetime values from Oracle data and upload them.

    Runs the query found in ``context['templates_dict']['query']``, fits a
    BG/NBD model on a calibration period ending 2018-05-24, and renders four
    diagnostic charts as SVG. It then fits a Gamma-Gamma model on returning
    customers, computes a 30-month CLV for every customer, and uploads the
    resulting CSV plus the charts to ``storage_bucket`` under
    ``<src_client_id>/<ds_nodash>/``.

    Parameters
    ----------
    oracle_conn_id
        Connection id handed to ``OracleHook``.
    src_client_id
        Client identifier; used in file names and in the object prefix.
    storage_bucket
        Destination bucket for the uploads.
    ds
        Not used by this function.
    **context
        Must provide ``templates_dict['query']`` and ``ds_nodash``.
    """
    import matplotlib.pyplot
    matplotlib.pyplot.ioff()

    import datetime

    import pandas as pd
    from lifetimes import BetaGeoFitter
    from lifetimes import GammaGammaFitter
    from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases
    from lifetimes.plotting import plot_frequency_recency_matrix
    from lifetimes.plotting import plot_period_transactions
    from lifetimes.plotting import plot_probability_alive_matrix
    from lifetimes.utils import calibration_and_holdout_data
    from lifetimes.utils import summary_data_from_transaction_data

    conn = OracleHook(oracle_conn_id=oracle_conn_id).get_conn()
    try:
        print(src_client_id, context)
        query = context['templates_dict']['query']
        data = pd.read_sql(query, con=conn)
    finally:
        # Release the database connection even if the query fails.
        conn.close()
    data.columns = data.columns.str.lower()
    print(data.head())

    # Calculate RFM values for the calibration / holdout split.
    calibration_end_date = datetime.datetime(2018, 5, 24)
    training_rfm = calibration_and_holdout_data(
        transactions=data,
        customer_id_col='src_user_id',
        datetime_col='pickup_date',
        calibration_period_end=calibration_end_date,
        freq='D',
        monetary_value_col='price_total')

    bgf = BetaGeoFitter(penalizer_coef=0.0)
    bgf.fit(training_rfm['frequency_cal'], training_rfm['recency_cal'],
            training_rfm['T_cal'])
    print(bgf)

    # Chart file names: "<ds_nodash><src_client_id>_<chart>.svg".
    ds_nodash = context.get("ds_nodash")
    chart_prefix = ds_nodash + str(src_client_id)
    plot_period_transactions_chart = (
        chart_prefix + '_plot_period_transactions_chart.svg')
    plot_frequency_recency_chart = (
        chart_prefix + '_plot_frequency_recency_matrix.svg')
    plot_probability_chart = (
        chart_prefix + '_plot_probability_alive_matrix.svg')
    plot_calibration_vs_holdout_chart = (
        chart_prefix + '_plot_calibration_vs_holdout_purchases.svg')

    ax0 = plot_period_transactions(bgf, max_frequency=30)
    ax0.figure.savefig(plot_period_transactions_chart, format='svg')
    ax1 = plot_frequency_recency_matrix(bgf)
    ax1.figure.savefig(plot_frequency_recency_chart, format='svg')
    ax2 = plot_probability_alive_matrix(bgf)
    ax2.figure.savefig(plot_probability_chart, format='svg')
    ax3 = plot_calibration_purchases_vs_holdout_purchases(bgf,
                                                          training_rfm,
                                                          n=50)
    ax3.figure.savefig(plot_calibration_vs_holdout_chart, format='svg')

    full_rfm = summary_data_from_transaction_data(
        data,
        customer_id_col='src_user_id',
        datetime_col='pickup_date',
        monetary_value_col='price_total',
        datetime_format=None,
        observation_period_end=None,
        freq='D')

    # The Gamma-Gamma model is fitted on returning customers only.
    returning_full_rfm = full_rfm[full_rfm['frequency'] > 0]
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(returning_full_rfm['frequency'],
            returning_full_rfm['monetary_value'])

    customer_lifetime = 30  # expected number of months lifetime of a customer
    clv = ggf.customer_lifetime_value(
        bgf,  # the model to use to predict the number of future transactions
        full_rfm['frequency'],
        full_rfm['recency'],
        full_rfm['T'],
        full_rfm['monetary_value'],
        time=customer_lifetime,  # months
        discount_rate=0.01  # monthly discount rate ~ 12.7% annually
    ).sort_values(ascending=False)

    full_rfm_with_value = full_rfm.join(clv)
    full_rfm_file = ds_nodash + "-src_client_id-" + str(
        src_client_id) + '-icabbi-test.csv'
    full_rfm_with_value.to_csv(full_rfm_file)

    # Upload each generated file under its own object name, with the
    # matching local file as the payload.
    gcs_hook = GoogleCloudStorageHook(
        google_cloud_storage_conn_id='google_conn_default')
    object_prefix = str(src_client_id) + "/" + ds_nodash + "/"
    generated_files = (
        full_rfm_file,
        plot_period_transactions_chart,
        plot_frequency_recency_chart,
        plot_probability_chart,
        plot_calibration_vs_holdout_chart,
    )
    for local_file in generated_files:
        gcs_hook.upload(bucket=storage_bucket,
                        object=object_prefix + local_file,
                        filename=local_file)
def fit_ggf(self):
    """Create ``self.ggf`` and fit it on ``self.return_customers``.

    The Gamma-Gamma model is created with ``penalizer_coef=0`` and fitted on
    the ``frequency`` and ``monetary_value`` columns.
    """
    spend_model = GammaGammaFitter(penalizer_coef=0)
    # Stored before fitting, so the attribute exists even if fit() raises.
    self.ggf = spend_model
    spend_model.fit(
        self.return_customers['frequency'],
        self.return_customers['monetary_value'],
    )
########################################
# BGNBD
########################################

# Requires the `lifetimes` package (pip install lifetimes).
bgf = BetaGeoFitter(penalizer_coef=0.001)
bgf.fit(
    frequency=rfm["frequency"],
    recency=rfm["recency_weekly_p"],
    T=rfm["T_weekly"],
)

########################################
# Gamma Gamma
########################################

ggf = GammaGammaFitter(penalizer_coef=0.01)
ggf.fit(frequency=rfm["frequency"], monetary_value=rfm["monetary_avg"])

# 6-month CLTV prediction; recency and T are expressed in weeks (freq="W").
# reset_index() turns the index of the result into a regular column.
cltv_6_months = ggf.customer_lifetime_value(
    transaction_prediction_model=bgf,
    frequency=rfm['frequency'],
    recency=rfm['recency_weekly_p'],
    T=rfm['T_weekly'],
    monetary_value=rfm['monetary_avg'],
    time=6,
    freq="W",
    discount_rate=0.01,
).reset_index()
# Plot the expected sales for 1 month (4 weeks) against 3 months (12 weeks).
rfm["exp_P_1"] = bgf.predict(4,
                             rfm['frequency'],
                             rfm['recency_weekly_p'],
                             rfm['T_weekly'])
rfm["exp_P_3"] = bgf.predict(4 * 3,
                             rfm['frequency'],
                             rfm['recency_weekly_p'],
                             rfm['T_weekly'])

fig, ax = plt.subplots()
ax.plot(rfm.index, rfm["exp_P_1"])
ax.plot(rfm.index, rfm["exp_P_3"])
plt.show()

##############################################################
# 3. Building the GAMMA-GAMMA model
##############################################################

rfm.head()

# Create the model object and train it on frequency and monetary_avg.
ggf = GammaGammaFitter(penalizer_coef=0.01)
ggf.fit(frequency=rfm['frequency'], monetary_value=rfm['monetary_avg'])

# Expected average profit per customer from the Gamma-Gamma model.
# Open question from the author: can this prediction be made time-dependent?
# Would the expected return not change over time anyway?
rfm["expected_average_profit"] = ggf.conditional_expected_average_profit(
    rfm['frequency'], rfm['monetary_avg'])

# Notebook-style inspection: first rows, then the largest values.
rfm["expected_average_profit"].head(10)
rfm["expected_average_profit"].sort_values(ascending=False).head(10)
rfm.sort_values("expected_average_profit", ascending=False).head(20)
rfm.shape
bgf.fit(summary_cal_holdout['frequency_cal'], summary_cal_holdout['recency_cal'], summary_cal_holdout['T_cal']) plot_calibration_purchases_vs_holdout_purchases(bgf, summary_cal_holdout) # Visualization plot_frequency_recency_matrix(bgf) plot_probability_alive_matrix(bgf) plt.show() ### Gamma-Gamma model### returning_customers_summary = data[data['frequency'] > 0] returning_customers_summary[[ 'monetary_value', 'frequency' ]].corr() # Correlation between monetary value and the purchase frequency. ggf = GammaGammaFitter(penalizer_coef=0) ggf.fit(returning_customers_summary['frequency'], returning_customers_summary['monetary_value']) print(ggf) # estimate the average transaction value print( ggf.conditional_expected_average_profit(data['frequency'], data['monetary_value']).head(10)) # refit the BG model to the summary_with_money_value dataset bgf.fit(data['frequency'], data['recency'], data['T']) CLV_12M = ggf.customer_lifetime_value( bgf, # the model to use to predict the number of future transactions data['frequency'],
def create_cltv_p(dataframe, today_date=None):
    """Predict 6-month customer lifetime value (CLTV) per customer.

    Parameters
    ----------
    dataframe
        Transaction lines with ``Customer ID``, ``InvoiceDate``, ``Invoice``
        and ``TotalPrice`` columns.
    today_date : datetime, optional
        Reference date for the tenure ``T``. Defaults to 2011-12-11.

    Returns
    -------
    DataFrame
        Per-customer frame restricted to customers with a positive average
        spend and more than one invoice. Columns are ``recency_cltv_p``,
        ``T``, ``monetary_avg``, ``recency_weekly_cltv_p``, ``T_weekly``,
        ``exp_sales_1_month``, ``exp_sales_3_month``,
        ``expected_average_profit``, ``cltv_p`` (1-100 scale) and
        ``cltv_p_segment`` (C/B/A).
    """
    if today_date is None:
        today_date = dt.datetime(2011, 12, 11)

    # Recency is computed per customer: last purchase minus first purchase.
    rfm = dataframe.groupby('Customer ID').agg(
        {'InvoiceDate': [lambda date: (date.max() - date.min()).days,
                         lambda date: (today_date - date.min()).days],
         'Invoice': lambda num: num.nunique(),
         'TotalPrice': lambda TotalPrice: TotalPrice.sum()})
    rfm.columns = rfm.columns.droplevel(0)
    rfm.columns = ['recency_cltv_p', 'T', 'frequency', 'monetary']

    # Simplified monetary_avg: total spend per invoice.
    rfm["monetary"] = rfm["monetary"] / rfm["frequency"]
    rfm.rename(columns={"monetary": "monetary_avg"}, inplace=True)

    # Weekly recency and weekly T for BG/NBD.
    rfm["recency_weekly_cltv_p"] = rfm["recency_cltv_p"] / 7
    rfm["T_weekly"] = rfm["T"] / 7

    # Keep customers with positive average spend and more than one invoice.
    rfm = rfm[rfm["monetary_avg"] > 0]
    rfm = rfm[(rfm['frequency'] > 1)]
    # Detach from the filtered slices before assigning columns; otherwise
    # pandas emits SettingWithCopyWarning on the writes below.
    rfm = rfm.copy()
    rfm["frequency"] = rfm["frequency"].astype(int)

    weekly_inputs = (rfm['frequency'],
                     rfm['recency_weekly_cltv_p'],
                     rfm['T_weekly'])

    # BG/NBD: expected number of sales in 4 and 12 weeks.
    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(*weekly_inputs)
    rfm["exp_sales_1_month"] = bgf.predict(4, *weekly_inputs)
    rfm["exp_sales_3_month"] = bgf.predict(12, *weekly_inputs)

    # Gamma-Gamma: expected average profit.
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(rfm['frequency'], rfm['monetary_avg'])
    rfm["expected_average_profit"] = ggf.conditional_expected_average_profit(
        rfm['frequency'], rfm['monetary_avg'])

    # 6-month CLTV.
    cltv = ggf.customer_lifetime_value(bgf,
                                       rfm['frequency'],
                                       rfm['recency_weekly_cltv_p'],
                                       rfm['T_weekly'],
                                       rfm['monetary_avg'],
                                       time=6,
                                       freq="W",
                                       discount_rate=0.01)
    rfm["cltv_p"] = cltv

    # Min-max scale to 1-100.
    scaler = MinMaxScaler(feature_range=(1, 100))
    scaler.fit(rfm[["cltv_p"]])
    rfm["cltv_p"] = scaler.transform(rfm[["cltv_p"]])

    # Tercile segment labels.
    rfm["cltv_p_segment"] = pd.qcut(rfm["cltv_p"], 3, labels=["C", "B", "A"])

    rfm = rfm[["recency_cltv_p", "T", "monetary_avg", "recency_weekly_cltv_p",
               "T_weekly", "exp_sales_1_month", "exp_sales_3_month",
               "expected_average_profit", "cltv_p", "cltv_p_segment"]]
    return rfm
def create_cltv_p(dataframe, today_date=None):
    """Create the predictive CLTV table (BG/NBD + Gamma-Gamma) per customer.

    Parameters
    ----------
    dataframe
        Transaction lines with ``Customer ID``, ``InvoiceDate``, ``Invoice``
        and ``TotalPrice`` columns.
    today_date : datetime, optional
        Analysis date used for the tenure ``T``. Defaults to 2011-12-11.

    Returns
    -------
    DataFrame
        Customers with a positive average spend and more than one invoice,
        with recency/tenure inputs, 1- and 3-month expected sales, expected
        average profit, ``cltv_p`` scaled to 1-100 and ``cltv_p_segment``
        (C/B/A terciles).
    """
    if today_date is None:
        today_date = dt.datetime(2011, 12, 11)

    # Recency is user-specific: days between first and last purchase.
    rfm = dataframe.groupby('Customer ID').agg({
        'InvoiceDate': [
            lambda date: (date.max() - date.min()).days,  # "recency_cltv_p"
            lambda date: (today_date - date.min()).days,  # "T"
        ],
        'Invoice': lambda num: num.nunique(),  # "frequency"
        'TotalPrice': lambda TotalPrice: TotalPrice.sum(),  # "monetary"
    })
    rfm.columns = rfm.columns.droplevel(0)
    rfm.columns = ["recency_cltv_p", "T", "frequency", "monetary"]

    # Simplified monetary_avg (the Gamma-Gamma model is fed an average).
    rfm["monetary"] = rfm["monetary"] / rfm["frequency"]
    rfm.rename(columns={"monetary": "monetary_avg"}, inplace=True)

    # Weekly recency and weekly T for the BG/NBD model.
    rfm["recency_weekly_cltv_p"] = rfm["recency_cltv_p"] / 7
    rfm["T_weekly"] = rfm["T"] / 7

    # Average spend must be positive and the customer must have more than
    # one invoice. The copy makes the result an independent frame so the
    # assignments below do not raise SettingWithCopyWarning.
    positive_spend = rfm["monetary_avg"] > 0
    repeat_buyer = rfm["frequency"] > 1
    rfm = rfm[positive_spend & repeat_buyer].copy()
    rfm["frequency"] = rfm["frequency"].astype(int)  # ensure integer counts

    # Establishing the BG/NBD model.
    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(rfm["frequency"], rfm["recency_weekly_cltv_p"], rfm["T_weekly"])

    # Expected sales over 1 month (4 weeks) and 3 months (12 weeks).
    for column, weeks in (("exp_sales_1_month", 4),
                          ("exp_sales_3_month", 12)):
        rfm[column] = bgf.predict(weeks,
                                  rfm["frequency"],
                                  rfm["recency_weekly_cltv_p"],
                                  rfm["T_weekly"])

    # Gamma-Gamma model: expected average profit.
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(rfm["frequency"], rfm["monetary_avg"])
    rfm["expected_average_profit"] = ggf.conditional_expected_average_profit(
        rfm["frequency"], rfm["monetary_avg"])

    # CLTV prediction for 6 months.
    cltv = ggf.customer_lifetime_value(bgf,
                                       rfm["frequency"],
                                       rfm["recency_weekly_cltv_p"],
                                       rfm["T_weekly"],
                                       rfm["monetary_avg"],
                                       time=6,
                                       freq="W",
                                       discount_rate=0.01)
    rfm["cltv_p"] = cltv

    # Min-max scale to 1-100.
    scaler = MinMaxScaler(feature_range=(1, 100))
    scaler.fit(rfm[["cltv_p"]])
    rfm["cltv_p"] = scaler.transform(rfm[["cltv_p"]])

    # cltv_p_segment
    rfm["cltv_p_segment"] = pd.qcut(rfm["cltv_p"], 3, labels=["C", "B", "A"])

    rfm = rfm[["recency_cltv_p", "T", "monetary_avg", "recency_weekly_cltv_p",
               "T_weekly", "exp_sales_1_month", "exp_sales_3_month",
               "expected_average_profit", "cltv_p", "cltv_p_segment"]]
    return rfm
customer_detail['p_alive'] = mbgnbd.conditional_probability_alive( customer_detail['frequency'], customer_detail['recency'], customer_detail['T']) customer_detail.head() # In[14]: #The Gamma-Gamma model assumes that there is no relationship between the monetary value and the purchase frequency customer_detail[['avg_order_value', 'frequency']].corr() # In[15]: #It is used to estimate the average monetary value of customer transactions from lifetimes import GammaGammaFitter gg = GammaGammaFitter(penalizer_coef=0.001) gg.fit(customer_detail['frequency'], customer_detail['avg_order_value'], verbose=True) print( gg.conditional_expected_average_profit( customer_detail['frequency'], customer_detail['avg_order_value']).head(10)) # In[16]: customer_detail['clv'] = gg.customer_lifetime_value( mbgnbd, customer_detail['frequency'], customer_detail['recency'],
def get_spend_model(self):
    """Return a new ``GammaGammaFitter`` built with ``self.penalizer_coef``.

    The model is only constructed here; it is not fitted.
    """
    penalizer = self.penalizer_coef
    spend_model = GammaGammaFitter(penalizer_coef=penalizer)
    return spend_model
def create_cltv_p(dataframe, today_date=None):
    """Return per-customer CLTV predictions and C/B/A segments.

    Parameters
    ----------
    dataframe
        Transaction lines with ``Customer ID``, ``InvoiceDate``, ``Invoice``
        and ``TotalPrice`` columns.
    today_date : datetime, optional
        Date from which the customer tenure ``T`` is measured. Defaults to
        2011-12-11.

    Returns
    -------
    DataFrame
        Indexed by customer; limited to customers with a positive average
        spend and more than one invoice. Contains ``recency_cltv_p``, ``T``,
        ``monetary_avg``, ``recency_weekly_cltv_p``, ``T_weekly``,
        ``exp_sales_1_month``, ``exp_sales_3_month``,
        ``expected_average_profit``, ``cltv_p`` (scaled to 1-100) and
        ``cltv_p_segment``.
    """
    if today_date is None:
        today_date = dt.datetime(2011, 12, 11)

    # Recency is dynamic and specific to each customer.
    rfm = dataframe.groupby('Customer ID').agg({
        'InvoiceDate': [
            lambda date: (date.max() - date.min()).days,
            lambda date: (today_date - date.min()).days
        ],
        'Invoice': lambda num: num.nunique(),
        'TotalPrice': lambda TotalPrice: TotalPrice.sum()
    })
    rfm.columns = rfm.columns.droplevel(0)
    rfm.columns = ['recency_cltv_p', 'T', 'frequency', 'monetary']

    # Simplified monetary_avg.
    rfm["monetary"] = rfm["monetary"] / rfm["frequency"]
    rfm.rename(columns={"monetary": "monetary_avg"}, inplace=True)

    # Weekly recency and weekly T for BG/NBD.
    rfm["recency_weekly_cltv_p"] = rfm["recency_cltv_p"] / 7
    rfm["T_weekly"] = rfm["T"] / 7

    # Sanity check on spend, plus a frequency filter for a sounder CLTV
    # estimate. `.copy()` gives an independent frame so that the column
    # writes below do not trigger SettingWithCopyWarning.
    rfm = rfm.loc[(rfm["monetary_avg"] > 0) & (rfm['frequency'] > 1)].copy()
    rfm["frequency"] = rfm["frequency"].astype(int)

    # BG/NBD
    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(rfm['frequency'], rfm['recency_weekly_cltv_p'], rfm['T_weekly'])

    # exp_sales_1_month (4 weeks)
    rfm["exp_sales_1_month"] = bgf.predict(4, rfm['frequency'],
                                           rfm['recency_weekly_cltv_p'],
                                           rfm['T_weekly'])
    # exp_sales_3_month (12 weeks)
    rfm["exp_sales_3_month"] = bgf.predict(12, rfm['frequency'],
                                           rfm['recency_weekly_cltv_p'],
                                           rfm['T_weekly'])

    # expected_average_profit
    ggf = GammaGammaFitter(penalizer_coef=0.01)
    ggf.fit(rfm['frequency'], rfm['monetary_avg'])
    rfm["expected_average_profit"] = ggf.conditional_expected_average_profit(
        rfm['frequency'], rfm['monetary_avg'])

    # 6-month cltv_p
    rfm["cltv_p"] = ggf.customer_lifetime_value(bgf,
                                                rfm['frequency'],
                                                rfm['recency_weekly_cltv_p'],
                                                rfm['T_weekly'],
                                                rfm['monetary_avg'],
                                                time=6,
                                                freq="W",
                                                discount_rate=0.01)

    # Min-max scaling to the 1-100 range.
    scaler = MinMaxScaler(feature_range=(1, 100))
    scaler.fit(rfm[["cltv_p"]])
    rfm["cltv_p"] = scaler.transform(rfm[["cltv_p"]])

    # cltv_p_segment
    rfm["cltv_p_segment"] = pd.qcut(rfm["cltv_p"], 3, labels=["C", "B", "A"])

    rfm = rfm[[
        "recency_cltv_p", "T", "monetary_avg", "recency_weekly_cltv_p",
        "T_weekly", "exp_sales_1_month", "exp_sales_3_month",
        "expected_average_profit", "cltv_p", "cltv_p_segment"
    ]]
    return rfm
def predicted_purchase_time(account, timesteap):
    """Predict purchase probability and average order value per customer.

    Reads ``AIexcel/<account>.csv`` (orders) and ``AIexcel/<account>_ga.csv``
    (per-customer level and next contact time). It fits a BG/NBD model for
    the expected number of purchases and a Gamma-Gamma model for the
    expected average transaction amount.

    Order columns: ``name`` (recipient name), ``uuid`` (recipient mobile
    phone), ``invoiceDate`` (payment date), ``produce_name`` (product name),
    ``Total`` (total price).

    Parameters
    ----------
    account : str
        Base name of the CSV files inside ``AIexcel/``.
    timesteap
        Currently unused.
        NOTE(review): the prediction horizon below is hard-coded to 30;
        presumably this argument was meant to supply it -- confirm.

    Returns
    -------
    DataFrame
        For customers without a ``next_time`` on or after ``today``: three
        columns named 收件人手機 (customer phone), 顧客購買機率 (purchase
        probability, formatted as a percentage string capped at 100%) and
        平均交易金額 (expected average transaction amount).

    Notes
    -----
    ``today`` is not defined in this function; it is expected to exist at
    module level.
    """
    df = pd.read_csv(
        'AIexcel/' + account + '.csv',
        names=['name', 'uuid', 'invoiceDate', 'produce_name', 'Total'],
        sep=',',
        encoding='utf8',
        low_memory=False)
    df_ga = pd.read_csv('AIexcel/' + account + '_ga.csv',
                        names=['uuid', 'level', 'next_time'],
                        sep=',',
                        encoding='utf8',
                        low_memory=False)

    # Row 0 is the file's own header line (explicit `names` were given), so
    # it is skipped for the labels and then dropped.
    df_UserLabel = df_ga['level'][1:].tolist()
    df_ga.drop([0], inplace=True)
    if 'level' in df_ga:
        df_ga['level'] = df_ga.apply(ga_toLevel, axis=1)

    # Keep rows with a 19-character timestamp and a name of at most 10
    # characters. `.loc` replaces `.ix`, which was removed in pandas 1.0.
    df = df.loc[df.invoiceDate.str.len() == 19]
    df = df.loc[df.name.str.len() <= 10]

    # take three columns
    df1 = df[['uuid', 'invoiceDate', 'Total']]
    # NOTE(review): the original comment said "drop price == 1", but the
    # filter tests invoiceDate == 1 -- confirm which column was intended.
    df1_ = df1.drop(df1[df1['invoiceDate'] == 1].index)
    # Drop incomplete rows; copy so the conversions below act on an
    # independent frame.
    dataframe = df1_.dropna().copy()
    dataframe['invoiceDate'] = pd.to_datetime(dataframe['invoiceDate']).dt.date
    dataframe['Total'] = dataframe['Total'].astype(float)

    observation_end = dataframe.invoiceDate.max()
    data = summary_data_from_transaction_data(
        dataframe,
        'uuid',
        'invoiceDate',
        observation_period_end=observation_end)
    data2 = summary_data_from_transaction_data(
        dataframe,
        'uuid',
        'invoiceDate',
        monetary_value_col='Total',
        observation_period_end=observation_end)

    # BG/NBD: expected number of purchases within the next 30 periods.
    bgf = BetaGeoFitter(penalizer_coef=0.01)
    bgf.fit(data['frequency'], data['recency'], data['T'])
    purchase_time = data
    purchase_time['predicted_purchases'] = (
        bgf.conditional_expected_number_of_purchases_up_to_time(
            30, data['frequency'], data['recency'], data['T']))
    predicted_purchases_df = purchase_time[[
        'predicted_purchases'
    ]].sort_values(by='predicted_purchases', ascending=False)
    # Average gap between purchases; inf/NaN for frequency 0 are zeroed
    # further down.
    predicted_purchases_df['cycle'] = data['recency'] / data['frequency']

    # Gamma-Gamma: expected average transaction amount for returning
    # customers with a non-zero monetary value.
    returning_customers_summary = data2[(data2['frequency'] > 0)
                                        & (data2['monetary_value'] != 0)]
    ggf = GammaGammaFitter(penalizer_coef=0.001)
    ggf.fit(returning_customers_summary['frequency'],
            returning_customers_summary['monetary_value'])
    income = ggf.conditional_expected_average_profit(
        returning_customers_summary['frequency'],
        returning_customers_summary['monetary_value']).to_frame()
    income.columns = ['predicted_price']

    predicted_purchases_df = predicted_purchases_df.merge(income,
                                                          on=['uuid'],
                                                          how='left')
    predicted_purchases_df.reset_index(inplace=True)

    # Cap the expected purchase count at 1 so it can be shown as a
    # probability, then format it as a percentage string.
    mask = predicted_purchases_df.predicted_purchases > 1
    predicted_purchases_df.loc[mask, 'predicted_purchases'] = 1
    predicted_purchases_df['predicted_purchases'] = predicted_purchases_df[
        'predicted_purchases'].astype(float)
    predicted_purchases_df = predicted_purchases_df.sort_values(
        by=['predicted_purchases'], ascending=False)
    predicted_purchases_df['predicted_purchases'] = predicted_purchases_df[
        'predicted_purchases'].apply(lambda x: format(x, '.2%'))

    predicted_purchases_df = predicted_purchases_df.merge(df_ga,
                                                          left_on="uuid",
                                                          right_on="uuid",
                                                          how='left')
    predicted_purchases_df['level'] = predicted_purchases_df.apply(flag_df,
                                                                   axis=1)
    predicted_purchases_df.replace(np.nan, 0, inplace=True)
    predicted_purchases_df.replace(np.inf, 0, inplace=True)
    if 'next_time' not in predicted_purchases_df.columns:
        predicted_purchases_df['next_time'] = np.nan
    predicted_purchases_df['next_time'] = pd.to_datetime(
        predicted_purchases_df['next_time'])

    # Split customers by whether they already have a next_time on or after
    # `today`. The membership mask is computed once and reused.
    scheduled_uuids = predicted_purchases_df.loc[
        predicted_purchases_df.next_time >= today,
        'uuid'].astype(str).tolist()
    is_scheduled = predicted_purchases_df.uuid.isin(scheduled_uuids)
    # Copied because columns are assigned on this subset below.
    predicted_purchases_df_N = predicted_purchases_df[~is_scheduled].copy()
    predicted_purchases_df_off = predicted_purchases_df[is_scheduled]

    predicted_purchases_df_N['cycle'] = (
        predicted_purchases_df_N['cycle'] *
        predicted_purchases_df_N['level']).round(0).astype(int)
    predicted_purchases_df_N['next_time'] = (
        today + predicted_purchases_df_N.apply(time_df, axis=1))
    predicted_purchases_df_NQ = predicted_purchases_df_N.dropna()

    predicted_purchases_df_off = predicted_purchases_df_off.drop(
        columns=['predicted_purchases', 'cycle', 'predicted_price'])
    predicted_purchases_df_NQ = predicted_purchases_df_NQ.drop(
        columns=['predicted_purchases', 'cycle', 'predicted_price'])

    # NOTE(review): `res` is assembled but never written or returned (the
    # CSV export is disabled); kept so the function raises in the same
    # situations as before.
    df_ga = df_ga.merge(predicted_purchases_df_off,
                        left_on="uuid",
                        right_on="uuid",
                        how='left')
    df_ga = df_ga.merge(predicted_purchases_df_NQ,
                        left_on="uuid",
                        right_on="uuid",
                        how='left')
    notNull_df = df_ga[
        df_ga['level'].notnull() & df_ga['next_time'].notnull()].drop(
            columns=['level_y', 'next_time_y', 'next_time_x', 'level_x'])
    notNull_df2 = df_ga[
        df_ga['level_y'].notnull() & df_ga['next_time_y'].notnull()].drop(
            columns=['level', 'next_time', 'next_time_x', 'level_x'])
    notNull_df2.columns = ['uuid', 'level', 'next_time']
    res = pd.concat([notNull_df, notNull_df2], axis=0, ignore_index=True)
    res.rename(columns={u'uuid': u'收件人手機'}, inplace=True)
    res['UserLabel'] = pd.Series(df_UserLabel)
    res = res[[u'收件人手機', u'UserLabel', u'next_time']]

    predicted_purchases_df_N = predicted_purchases_df_N.drop(
        columns=['level', 'cycle', 'next_time'])
    predicted_purchases_df_N.columns = [u'收件人手機', u'顧客購買機率', u'平均交易金額']
    return predicted_purchases_df_N
def generate_clv_table(data, clv_prediction_time=None, model_penalizer=None):
    """Build a per-client table of historical and predicted customer value.

    Rows of ``data`` with ``Searches <= 0`` are dropped, the remainder is
    summarised with ``summary_data_from_transaction_data``, a BG/NBD model
    and a Gamma-Gamma model are fitted, and the resulting predictions are
    merged with the per-client historical totals.

    Parameters
    ----------
    data : pandas.DataFrame
        Activity rows. The columns read here are ``client_id``,
        ``activity_date``, ``Searches`` and ``Revenue``.
    clv_prediction_time : int, optional
        Passed as ``time`` to ``customer_lifetime_value``. Defaults to 12.
    model_penalizer : float, optional
        ``penalizer_coef`` for the BG/NBD model. Defaults to 0.

    Returns
    -------
    pandas.DataFrame
        Indexed by client, with the columns ``frequency``, ``recency``,
        ``customer_age``, ``avg_session_value``,
        ``predicted_searches_14_days``, ``alive_probability``,
        ``predicted_clv_12_months``, ``historical_searches``,
        ``historical_clv``, ``total_clv``, ``days_since_last_active``,
        ``user_status`` and ``calc_date``.

    Notes
    -----
    The ``predicted_clv_12_months`` column keeps its name even when
    ``clv_prediction_time`` is not 12.
    """
    # Set default values if they are not stated.
    if clv_prediction_time is None:
        clv_prediction_time = 12
    if model_penalizer is None:
        model_penalizer = 0

    # Remove non-search sessions.
    data = data[data['Searches'] > 0]
    max_date = data['activity_date'].max()

    # Aggregate the activity stream into frequency / recency / T /
    # monetary_value per client, observed up to the latest activity date.
    summary = summary_data_from_transaction_data(
        data,
        'client_id',
        'activity_date',
        'Revenue',
        observation_period_end=max_date)

    # BG/NBD model fitted on every client.
    bgf = BetaGeoFitter(penalizer_coef=model_penalizer)
    bgf.fit(summary['frequency'], summary['recency'], summary['T'])

    # Expected number of purchases per client over the next t periods.
    t = 14
    summary['predicted_searches'] = (
        bgf.conditional_expected_number_of_purchases_up_to_time(
            t, summary['frequency'], summary['recency'], summary['T']))

    # Conditional alive probability, computed row by row.
    summary['alive_prob'] = summary.apply(
        lambda row: calc_alive_prob(row, bgf), axis=1)
    summary['alive_prob'] = summary['alive_prob'].astype(float)

    # There cannot be non-positive values in the monetary_value or
    # frequency vectors handed to the Gamma-Gamma model.
    returning = summary[(summary['monetary_value'] > 0)
                        & (summary['frequency'] > 0)]

    # All three model inputs are columns of the same frame, so one length
    # check covers frequency, recency and T.
    no_returning_clients = len(returning) == 0
    if no_returning_clients:
        logger.debug(data['client_id'])

    # Gamma-Gamma model for the expected value per transaction.
    ggf = GammaGammaFitter(penalizer_coef=0)
    ggf.fit(returning['frequency'], returning['monetary_value'])

    # Refit the BG/NBD model on the same subset when it is non-empty.
    # The previous guard negated a generator object, which is always
    # truthy, so this refit never ran.
    if not no_returning_clients:
        bgf.fit(returning['frequency'], returning['recency'], returning['T'])

    # Customer lifetime value from the Gamma-Gamma model, using bgf to
    # predict the number of future transactions.
    customer_predicted_value = ggf.customer_lifetime_value(
        bgf,
        returning['frequency'],
        returning['recency'],
        returning['T'],
        returning['monetary_value'],
        time=clv_prediction_time,  # months
        discount_rate=0.01  # monthly discount rate ~ 12.7% annually
    )

    # Re-index the predictions by client_id so they merge on the index.
    df_cpv = pd.DataFrame({
        'client_id': customer_predicted_value.index,
        'pred_values': customer_predicted_value.values
    }).set_index('client_id')

    # Merge with the original summary; clients without a prediction get NaN.
    df_merged = pd.merge(summary,
                         df_cpv,
                         left_index=True,
                         right_index=True,
                         how='outer')

    # Historical totals per client. Columns are selected with a list
    # because tuple-style selection is rejected by current pandas.
    data_hist = data.groupby(['client_id'])[['Searches', 'Revenue']].apply(
        lambda grp: grp.astype(float).sum())

    df_final = pd.merge(df_merged,
                        data_hist,
                        left_index=True,
                        right_index=True,
                        how='outer')

    # Prevent NaN in the predicted value of non-returning clients.
    # .loc writes to df_final itself rather than to a temporary.
    df_final.loc[df_final['frequency'] == 0, 'pred_values'] = 0.0

    # Combine historical and predicted customer value.
    df_final['total_clv'] = df_final['pred_values'] + df_final['Revenue']

    # Time since the client was last active.
    df_final['last_active'] = df_final['T'] - df_final['recency']

    # Label users inactive for more than 14 periods as "Expired".
    df_final['user_status'] = np.where(df_final['last_active'] > 14,
                                       'Expired', 'Active')

    # The calculation date is the latest activity date in the data.
    df_final['calc_date'] = max_date.date()

    # Rename by mapping so the result does not depend on column order.
    df_final = df_final.rename(columns={
        'T': 'customer_age',
        'monetary_value': 'avg_session_value',
        'predicted_searches': 'predicted_searches_14_days',
        'alive_prob': 'alive_probability',
        'pred_values': 'predicted_clv_12_months',
        'Searches': 'historical_searches',
        'Revenue': 'historical_clv',
        'last_active': 'days_since_last_active',
    })

    # Prevent non-returning clients from having 100% alive probability.
    df_final.loc[df_final['frequency'] == 0, 'alive_probability'] = 0.0
    return df_final
def create_cltv_p(dataframe, today_date=None):
    """Compute a scaled 6-month CLTV prediction and segment per customer.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Transaction rows. The columns read here are ``Customer ID``,
        ``Invoice``, ``InvoiceDate`` and ``TotalPrice``.
    today_date : datetime.datetime, optional
        Reference date for the ``recency`` and ``T`` day counts.
        Defaults to ``dt.datetime(2011, 12, 11)``.

    Returns
    -------
    pandas.DataFrame
        Indexed by integer customer id, with the columns ``monetary_avg``,
        ``T``, ``recency_weekly``, ``T_weekly``, ``exp_sales_1_month``,
        ``exp_sales_3_month``, ``expected_average_profit``, ``cltv_p``
        (scaled to the range 1-100) and ``cltv_p_segment`` (tertiles
        labelled ``C``, ``B``, ``A`` from lowest to highest).
    """
    if today_date is None:
        today_date = dt.datetime(2011, 12, 11)

    # Per-customer base metrics.
    # NOTE(review): recency is the number of days from the last invoice to
    # today_date and frequency is the count of distinct invoices. The
    # lifetimes docs define recency as the customer's age at the last
    # purchase and frequency as the number of repeat purchases. Confirm
    # which definition is intended before changing it.
    rfm = dataframe.groupby('Customer ID').agg(
        recency=('InvoiceDate',
                 lambda date: (today_date - date.max()).days),
        T=('InvoiceDate',
           lambda date: (today_date - date.min()).days),
        frequency=('Invoice', 'nunique'),
        monetary=('TotalPrice', 'sum'),
    )

    # monetary_avg is the mean, per customer, of the per-invoice mean
    # TotalPrice.
    invoice_means = dataframe.groupby(
        ["Customer ID", "Invoice"])["TotalPrice"].mean()
    monetary_avg = invoice_means.groupby(
        level="Customer ID").mean().rename("monetary_avg")
    rfm = rfm.join(monetary_avg, how="left")
    rfm.index = rfm.index.astype(int)

    # Weekly recency and weekly T for BG/NBD.
    rfm["recency_weekly"] = rfm["recency"] / 7
    rfm["T_weekly"] = rfm["T"] / 7

    # Keep customers with a positive average value. The copy makes the
    # column assignments below write to an independent frame rather than
    # to a slice of the unfiltered one.
    rfm = rfm[rfm["monetary_avg"] > 0].copy()
    rfm["frequency"] = rfm["frequency"].astype(int)

    # BG/NBD: expected number of sales over 4 and 12 weeks.
    bgf = BetaGeoFitter(penalizer_coef=0.001)
    bgf.fit(rfm['frequency'], rfm['recency_weekly'], rfm['T_weekly'])
    rfm["exp_sales_1_month"] = bgf.predict(4,
                                           rfm['frequency'],
                                           rfm['recency_weekly'],
                                           rfm['T_weekly'])
    rfm["exp_sales_3_month"] = bgf.predict(12,
                                           rfm['frequency'],
                                           rfm['recency_weekly'],
                                           rfm['T_weekly'])

    # Gamma-Gamma: expected average profit per transaction.
    ggf = GammaGammaFitter(penalizer_coef=0.001)
    ggf.fit(rfm['frequency'], rfm['monetary_avg'])
    rfm["expected_average_profit"] = ggf.conditional_expected_average_profit(
        rfm['frequency'], rfm['monetary_avg'])

    # 6-month CLTV prediction on weekly-scaled inputs.
    rfm["cltv_p"] = ggf.customer_lifetime_value(bgf,
                                                rfm['frequency'],
                                                rfm['recency_weekly'],
                                                rfm['T_weekly'],
                                                rfm['monetary_avg'],
                                                time=6,
                                                freq="W",
                                                discount_rate=0.01)

    # Rescale the prediction to the range 1-100.
    scaler = MinMaxScaler(feature_range=(1, 100))
    rfm["cltv_p"] = scaler.fit_transform(rfm[["cltv_p"]])

    # Tertile segments on the scaled score.
    rfm["cltv_p_segment"] = pd.qcut(rfm["cltv_p"], 3, labels=["C", "B", "A"])

    rfm = rfm[["monetary_avg", "T", "recency_weekly", "T_weekly",
               "exp_sales_1_month", "exp_sales_3_month",
               "expected_average_profit", "cltv_p", "cltv_p_segment"]]
    return rfm