# Cache of loaded models keyed by file path, so repeated calls do not
# re-read and re-deserialize the pickle from disk every time.
_model_cache = {}


def _load_clv_model(path):
    """Load a fitted BetaGeoFitter from *path*, memoizing it by path."""
    model = _model_cache.get(path)
    if model is None:
        # penalizer_coef here is irrelevant: load_model restores the
        # fitted parameters from the pickle.
        model = BetaGeoFitter(penalizer_coef=0.0)
        model.load_model(path=path)
        _model_cache[path] = model
    return model


def probability_alive(historical_rfm_data,
                      model_path="models/customer_lifetime_estimator.pkl"):
    """
    Predicted Conditional Probability Alive.

    Parameters
    ----------
    historical_rfm_data:
        Historical Frequency, Recency & T of an individual; a mapping with
        keys "frequency", "recency" and "T".
    model_path: str, optional
        Path of the pickled, fitted BetaGeoFitter. Defaults to the
        previously hard-coded location, so existing callers are unaffected.

    Returns
    -------
    Conditional Probability Alive, as returned by
    ``BetaGeoFitter.conditional_probability_alive``.
    """
    clv_model = _load_clv_model(model_path)
    alive_probability = clv_model.conditional_probability_alive(
        frequency=historical_rfm_data["frequency"],
        recency=historical_rfm_data["recency"],
        T=historical_rfm_data["T"])
    return alive_probability
class CLV(object):
    """
    Estimate customer lifetime value (CLV) from transaction history using
    a BG/NBD model (purchase frequency) and a Gamma-Gamma model
    (transaction value) from the ``lifetimes`` package.

    INPUT
    pmg_num (int) the product market group number, default = 1
    outfile1 (str) the filename indicating where to store the raw data
        before analysis, default = '../data/clvtrainingset01.csv'
    outfile2 (str) the filename containing the results,
        default = '../data/clv01.csv'
    date_range (list) the start date and end date of the years to analyze,
        default = ['2008-09-01','2016-09-01']

    attributes other than those listed above
    self.data (DataFrame) a pandas DataFrame object of the data to be used
        for analysis
    self.bgf (from lifetimes) a statistical model object from the
        lifetimes package
    self.ggf (from lifetimes) a statistical model object from the
        lifetimes package
    self.results (DataFrame) a pandas DataFrame object of the results of
        analysis, sorted by estimated CLV
    """

    # Immutable class-level default so instances never share mutable state.
    _DEFAULT_DATE_RANGE = ('2008-09-01', '2016-09-01')

    def __init__(self, pmg_num=1,
                 outfile1='../data/clvtrainingset01.csv',
                 outfile2='../data/clv01.csv',
                 date_range=None):
        self.pmg_num = pmg_num
        # outfile1 stores a clean version of the raw data used for analysis;
        # this is important for reproducibility
        self.outfile1 = outfile1
        # outfile2 stores the clv estimation results
        self.outfile2 = outfile2
        # BUGFIX: the default used to be a mutable list literal shared by
        # every instance; use a None sentinel and copy a fresh list instead.
        if date_range is None:
            date_range = list(self._DEFAULT_DATE_RANGE)
        self.date_range = date_range
        self.data = None
        self.bgf = None
        self.ggf = None
        self.results = None

    def get_data_from_server(self, cmd=None):
        """
        Gets data from sales_db and stores the query results in self.data

        INPUT
        cmd (str) the default sql query is below

        The default query has been replaced. The original query was an
        8 line select command.
        """
        # server name
        dsn = "THE SERVER NAME"
        cnxn_name = "DSN=%s" % dsn
        connection = odbc.connect(cnxn_name)  # use to access the database
        c = connection.cursor()  # generate cursor object

        # Grab transaction data from Postgres
        if not cmd:
            cmd = """SQL DEFAULT COMMAND GOES HERE""" % (
                self.pmg_num, self.date_range[0], self.date_range[1])
        c.execute(cmd)  # execute the sql command

        # list to store the query data
        transaction_data = []

        # create a dictionary to convert customer ids to name
        to_name = dict(np.genfromtxt('../data/names.csv', dtype=str,
                                     delimiter='\t'))

        for row in c:
            # pull data from each row of the query data
            cust, rsv_date, sales = row
            cust_id = str(int(cust))
            # NOTE(review): to_name[cust_id] raises KeyError for unknown
            # customers -- presumably names.csv covers every id; verify.
            name = to_name[cust_id]
            # check to see if customer is inactive
            if use(name):
                sales_float = float(sales)  # transaction amount
                # add dictionary of data to list (raw values; pandas
                # handles the date conversion downstream)
                transaction_data.append({"id": cust, "date": rsv_date,
                                         "sales": sales_float})

        # convert to dataframe
        df = pd.DataFrame(transaction_data, columns=['id', 'date', 'sales'])

        # store a clean copy of the raw data for reproducibility
        df.to_csv(self.outfile1, index=False)

        # IMPORTANT: use correct observation_period_end date
        self.data = summary_data_from_transaction_data(
            df, 'id', 'date', 'sales',
            observation_period_end=self.date_range[1], freq='M')

    def get_data_from_file(self, filename, **kwargs):
        """
        Loads transaction data from a CSV file, summarizes it into RFM
        form and stores the result in self.data

        INPUT
        filename (str) path of the CSV file to read
        kwargs passed through to pandas.read_csv
        """
        df = pd.read_csv(filename, **kwargs)
        # IMPORTANT: use correct observation_period_end date
        self.data = summary_data_from_transaction_data(
            df, 'id', 'date', 'sales',
            observation_period_end=self.date_range[1], freq='M')

    def fit(self, months=96):
        """
        Computes CLV estimates for the next n months and stores results
        in self.results (also written to self.outfile2)

        INPUT
        months (int) number of months to predict,
            default = 96 (8 years)
        """
        ### PREDICT NUMBER OF PURCHASES
        self.bgf = BetaGeoFitter()  # see lifetimes module documentation for details
        self.bgf.fit(self.data['frequency'], self.data['recency'],
                     self.data['T'])
        # 8 years = 96 months (summary data was built with freq='M')
        self.data['predicted_purchases'] = \
            self.bgf.conditional_expected_number_of_purchases_up_to_time(
                months, self.data['frequency'], self.data['recency'],
                self.data['T'])

        ### PREDICT FUTURE PURCHASE AMOUNT
        # NOTE(review): the Gamma-Gamma model is conventionally fit only on
        # returning customers (frequency > 0, monetary_value > 0); fitting
        # on the full summary is kept here to preserve existing behavior --
        # consider filtering (see the lifetimes documentation).
        self.ggf = GammaGammaFitter(penalizer_coef=0)
        self.ggf.fit(self.data['frequency'], self.data['monetary_value'])
        # predict profit of the next transaction
        self.data['predicted_trans_profit'] = \
            self.ggf.conditional_expected_average_profit(
                frequency=self.data['frequency'],
                monetary_value=self.data['monetary_value'])

        ### ESTIMATE CLV
        self.data['clv_estimation'] = (self.data['predicted_trans_profit']
                                       * self.data['predicted_purchases'])
        self.data['prob_alive'] = self.bgf.conditional_probability_alive(
            self.data['frequency'], self.data['recency'], self.data['T'])
        self.results = self.data.sort_values(by='clv_estimation',
                                             ascending=False)
        # store results
        self.results.to_csv(self.outfile2, index=False)

    def plot_matrices(self):
        """
        plots three matrices:
        probability alive matrix: displays the probability that a customer
            is active
        frequency recency matrix: displays frequency and recency with color
            corresponding to monetary value
        period transactions: displays predicted and actual transaction
            values over time

        (check documentation in lifetimes for more details)
        """
        plot_probability_alive_matrix(self.bgf, cmap='viridis')
        plot_frequency_recency_matrix(self.bgf, cmap='viridis')
        plot_period_transactions(self.bgf)
from lifetimes import BetaGeoFitter #parameter eliminates overfitting and noise and robust bgf = BetaGeoFitter(penalizer_coef=0.0) bgf.fit(modeldata['frequency'], modeldata['recency'], modeldata['T']) print(bgf) # create frequency recency matrix from lifetimes.plotting import plot_frequency_recency_matrix plot_frequency_recency_matrix(bgf) #plot for predict purchase from lifetimes.plotting import plot_probability_alive_matrix #probability of being alive modeldata['Churn_probability'] = bgf.conditional_probability_alive(modeldata['frequency'], modeldata['recency'], modeldata['T']) #plot for churn or probability of being alive fig = plt.figure(figsize=(12,8)) plot_probability_alive_matrix(bgf) # predict number of purchase customer will make t = 30 #number of days to predict customer will make purchase modeldata['predicted_num_purchases'] = bgf.conditional_expected_number_of_purchases_up_to_time(t, modeldata['frequency'], modeldata['recency'], modeldata['T']) modeldata.sort_values(by='predicted_num_purchases').tail(5) from lifetimes.plotting import plot_period_transactions #used to validate the model plot_period_transactions(bgf)
############################### Customer Life Time Value Calculationn ########## # refit the BG model to the summary_with_money_value dataset, #the model to use to predict the number of future transactions from lifetimes import BetaGeoFitter bgf = BetaGeoFitter(penalizer_coef=0.0) bgf.fit(returning_customers_summary['frequency'], returning_customers_summary['recency'], returning_customers_summary['T']) CLV_1Year = ggf.customer_lifetime_value( bgf, returning_customers_summary['frequency'], returning_customers_summary['recency'], returning_customers_summary['T'], returning_customers_summary['monetary_value'], time=12, freq='D') CLV_1Year = pd.Series(CLV_1Year) ################# Churn Probability ############################### # probability of being churn: model is going to predict customer churn, i.e probability of customer being dead or probability that a customer will leave alive = bgf.conditional_probability_alive( returning_customers_summary['frequency'], returning_customers_summary['recency'], returning_customers_summary['T']) ################ Final Output ############################### returning_customers_summary2 = returning_customers_summary.copy() returning_customers_summary2['Churn_Probability'] = 1 - alive returning_customers_summary2['AVG_SALE'] = AVG_Profit returning_customers_summary2['CLV_1Year'] = CLV_1Year returning_customers_summary2.to_csv('output.csv')