# In[ ]:

# Fit a Cox proportional-hazards model on the churn data and inspect it.
import matplotlib
from matplotlib import pyplot as plt
import lifelines
from lifelines import KaplanMeierFitter  # survival analysis library
from lifelines.statistics import logrank_test  # survival statistical testing
from lifelines import CoxPHFitter

df['churn'] = df1.fuga

# NOTE(review): every column of df other than the duration/event columns is
# used as a covariate, which includes the 'churn' column added just above
# (and 'time_to_fuga', if it is already present) -- confirm this is intended.
cph = CoxPHFitter()
# duration_col / event_col are *names* of columns inside df, not Series.
# df carries both columns: they are dropped from it in the next cell.
cph.fit(df, duration_col='enddt', event_col='FUGA', show_progress=True)
cph.print_summary()
cph.plot()

# In[ ]:

# Predict on the covariates only (duration and event columns removed).
df_2 = df.drop(['enddt', 'FUGA'], axis=1)
cph.predict_partial_hazard(df_2)
cph.predict_survival_function(df_2, times=[5., 25., 50.])
cph.predict_median(df_2)

kmf = KaplanMeierFitter()
T = df['time_to_fuga']  # duration
C = df['churn']  # censorship - 1 if death/churn is seen, 0 if censored
# -*- coding: utf-8 -*-
# Cox regression: time predict_median on an enlarged copy of the Rossi dataset.
if __name__ == "__main__":
    import time

    import numpy as np
    import pandas as pd
    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi

    # Stack 20 copies of the Rossi dataset to obtain a larger frame.
    rossi = load_rossi()
    stacked = pd.concat([rossi] * 20)
    # Alternative setups left disabled by the original author:
    # stacked = stacked.reset_index()
    # stacked['week'] = np.random.exponential(1, size=stacked.shape[0])

    cox_model = CoxPHFitter()
    cox_model.fit(stacked, duration_col="week", event_col="arrest", batch_mode=True)

    # Only the prediction (and printing it) is timed, not the fit.
    began = time.time()
    medians = cox_model.predict_median(stacked)
    print(medians)
    elapsed = time.time() - began
    print("--- %s seconds ---" % elapsed)

    cox_model.print_summary(4)
# Now, looking at the RemainingValue column, we can see which customers would
# most affect our bottom line.
# Great, so we know which customers have the highest risk of churn and when
# they are likely to, but what can we do?
# Let's take a look at our coefficients from earlier.
# We can see that the features that impact survival positively are
# 'Contract_One year', 'Contract_Two year',
# 'PaymentMethod_Bank transfer (automatic)' and
# 'PaymentMethod_Credit card (automatic)'. Beyond these the results are
# insignificant. Let's compare customers with the features to understand the
# best place to spend money.
upgrades = [
    'Contract_One year',
    'Contract_Two year',
    'PaymentMethod_Bank transfer (automatic)',
    'PaymentMethod_Credit card (automatic)',
]

# customer -> [baseline prediction, prediction with each upgrade toggled]
results_dict = {}
for customer in tqdm(values.index):
    # Two independent single-row frames: `actual` stays untouched, `change`
    # is the scratch copy on which one upgrade at a time is flipped.
    actual = data.loc[[customer]]
    change = data.loc[[customer]]
    # NOTE(review): the baseline uses the median while the upgrades below use
    # the `likelihood_cutoff` percentile; these only agree when the cutoff is
    # 0.5 -- confirm that is the case.
    results_dict[customer] = [cph.predict_median(actual)]
    for upgrade in upgrades:
        # Flip the flag (0 -> 1, anything else -> 0) ...
        change[upgrade] = 1 if list(change[upgrade]) == [0] else 0
        # ... predict on the *modified* row (predicting on `actual` here
        # would just repeat the unmodified customer for every upgrade) ...
        results_dict[customer].append(
            cph.predict_percentile(change, p=likelihood_cutoff)
        )
        # ... and flip it back so the next upgrade starts from the original.
        change[upgrade] = 1 if list(change[upgrade]) == [0] else 0

results_df = pd.DataFrame(results_dict).T
results_df.columns = ['baseline'] + upgrades
actions = values.join(results_df).drop([likelihood_cutoff], axis=1)

# Now we can calculate the difference between applying different features
# and the baseline.
actions['CreditCard Diff'] = (
    actions['PaymentMethod_Credit card (automatic)'] - actions['baseline']
) * actions['MonthlyCharges']

# NOTE(review): the following statement is cut off in the visible source. It
# is preserved verbatim as a comment so this block stays parseable; restore
# its missing tail before use.
# actions['BankTransfer Diff'] = ( actions['PaymentMethod_Bank transfer (automatic)'] -
weight_delays=weight_delays) #Create and train Cox Proportional Hazards model cph = CoxPHFitter() if weight_delays != None: cph.fit(n_train, duration_col='Target', weights_col='Weights', show_progress=True) else: cph.fit(n_train, duration_col='Target', show_progress=True) cph.print_summary() #Quantify predictions y_test = n_test.Target.values y_pred = cph.predict_median(n_test).values.T[0] #get mean average error for last 100 days mae_test = sutils.plot_mae_vs_y_true(y_pred, y_test, 100) #Plot results for presentation slides if necessary if plot_result: #Define times at which to plot CDF times = np.arange(100).tolist() #Plot lines delimiting an interval in time and corresponding values of the CDF low_x = 60 high_x = 80 low_y = 1 - cph.predict_survival_function(n_test[70:71], [low_x]) high_y = 1 - cph.predict_survival_function(n_test[70:71], [high_x])