Exemplo n.º 1
0
# In[ ]:

import matplotlib
from matplotlib import pyplot as plt
import lifelines
from lifelines import KaplanMeierFitter  # survival analysis library
from lifelines.statistics import logrank_test  # survival statistical testing
from lifelines import CoxPHFitter

# Copy the churn flag from df1 into the modelling frame.
# NOTE(review): every column of df other than the duration/event columns is
# used as a covariate by CoxPHFitter.fit, so 'churn' enters the model as a
# predictor here - confirm that is intended.
df['churn'] = df1.fuga

# Fit a Cox proportional-hazards model on df.
# duration_col / event_col must be *column names* of df (strings), not Series
# taken from another frame; 'enddt' and 'FUGA' are the columns of df that the
# next cell drops before predicting.
cph = CoxPHFitter()
cph.fit(df,
        duration_col='enddt',
        event_col='FUGA',
        show_progress=True)
cph.print_summary()
cph.plot()

# In[ ]:

# Covariates only: remove the duration and event columns before predicting.
df_2 = df.drop(['enddt', 'FUGA'], axis=1)
cph.predict_partial_hazard(df_2)
cph.predict_survival_function(df_2, times=[5., 25., 50.])
# Predict on df_2: the original passed `X`, which is not defined in the
# visible cells; df_2 is the covariate frame used by the calls above.
cph.predict_median(df_2)

# Inputs for a Kaplan-Meier fit.
kmf = KaplanMeierFitter()
T = df['time_to_fuga']  # duration
C = df['churn']  # censorship - 1 if death/churn is seen, 0 if censored
Exemplo n.º 2
0
# -*- coding: utf-8 -*-
# cox regression


if __name__ == "__main__":
    import pandas as pd
    import time
    import numpy as np

    from lifelines import CoxPHFitter
    from lifelines.datasets import load_rossi

    # Stack the Rossi dataset 20 times so the timed prediction has more rows.
    rossi_x20 = pd.concat([load_rossi()] * 20)

    model = CoxPHFitter()
    model.fit(
        rossi_x20,
        duration_col="week",
        event_col="arrest",
        batch_mode=True,
    )

    # Time the median prediction; printing the result is inside the timed span.
    started_at = time.time()
    medians = model.predict_median(rossi_x20)
    print(medians)
    elapsed = time.time() - started_at
    print("--- %s seconds ---" % elapsed)

    # Summary table with 4 decimals.
    model.print_summary(4)
# Now, looking at the RemainingValue column, we can see which customers would most affect our bottom line.

# Great, so we know which customers have the highest risk of churn and when they are likely to churn, but what can we do?
# Let's take a look at our coefficients from earlier.
# We can see that the features that impact survival positively are 'Contract_One year', 'Contract_Two year',
# 'PaymentMethod_Bank transfer (automatic)', 'PaymentMethod_Credit card (automatic)'. Beyond these the results are
# insignificant. Let's compare customers with and without these features to understand the best place to spend money.
# Features whose effect on predicted survival time is evaluated per customer.
upgrades = [
    'Contract_One year',
    'Contract_Two year',
    'PaymentMethod_Bank transfer (automatic)',
    'PaymentMethod_Credit card (automatic)',
]

# customer -> [baseline prediction, prediction with each upgrade toggled]
results_dict = {}
for customer in tqdm(values.index):
    actual = data.loc[[customer]]
    # Baseline: prediction for the customer's unmodified row.
    # NOTE(review): the baseline uses predict_median while the upgrades use
    # predict_percentile(p=likelihood_cutoff); they are only comparable if
    # likelihood_cutoff is 0.5 - confirm.
    results_dict[customer] = [cph.predict_median(actual)]
    for upgrade in upgrades:
        # Work on a fresh copy so each upgrade is evaluated in isolation and
        # the baseline row is never mutated (the original toggled the value
        # and then toggled it back).
        change = actual.copy()
        change[upgrade] = 1 if list(change[upgrade]) == [0] else 0
        # Predict on the modified row. The original predicted on `actual`,
        # so the toggled feature never reached the model and every upgrade
        # column just repeated the unmodified prediction.
        results_dict[customer].append(
            cph.predict_percentile(change, p=likelihood_cutoff)
        )

# One row per customer, one column per scenario.
results_df = pd.DataFrame(results_dict).T
results_df.columns = ['baseline'] + upgrades
# Attach the scenario predictions to `values` and drop the column labelled
# by likelihood_cutoff.
actions = values.join(results_df).drop([likelihood_cutoff], axis=1)

# Change in the prediction versus baseline when the credit-card payment
# feature is toggled, scaled by the customer's monthly charges.
credit_card_gain = (
    actions['PaymentMethod_Credit card (automatic)'] - actions['baseline']
)
actions['CreditCard Diff'] = credit_card_gain * actions['MonthlyCharges']
actions['BankTransfer Diff'] = (
    actions['PaymentMethod_Bank transfer (automatic)'] -
Exemplo n.º 4
0
                                                   weight_delays=weight_delays)

# Create and train the Cox Proportional Hazards model on n_train.
# No event_col is passed, so lifelines treats every row as an observed
# (uncensored) event.
cph = CoxPHFitter()
# Identity test against None: `!= None` is evaluated element-wise for
# array-like values, which makes the `if` ambiguous or wrong.
if weight_delays is not None:
    # Weighted fit: per-row weights are read from the 'Weights' column.
    cph.fit(n_train,
            duration_col='Target',
            weights_col='Weights',
            show_progress=True)
else:
    cph.fit(n_train, duration_col='Target', show_progress=True)
cph.print_summary()

# Quantify predictions: observed targets vs. predicted median survival times.
y_test = n_test.Target.values
# NOTE(review): `.values.T[0]` assumes predict_median returns a one-column
# 2-D result; confirm against the installed lifelines version.
y_pred = cph.predict_median(n_test).values.T[0]

# Error of the predictions against the true values; per the original comment
# the 100 restricts this to the last 100 days - see
# sutils.plot_mae_vs_y_true for the exact meaning.
mae_test = sutils.plot_mae_vs_y_true(y_pred, y_test, 100)

#Plot results for presentation slides if necessary
if plot_result:

    #Define times at which to plot CDF
    times = np.arange(100).tolist()

    #Plot lines delimiting an interval in time and corresponding values of the CDF
    low_x = 60
    high_x = 80
    low_y = 1 - cph.predict_survival_function(n_test[70:71], [low_x])
    high_y = 1 - cph.predict_survival_function(n_test[70:71], [high_x])