import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test


def kmplot(df_high, df_low, ax):
    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(durations=df_high.duration, event_observed=df_high.event,
                     label='High: n = ' + str(len(df_high)))
        kmf_low.fit(durations=df_low.duration, event_observed=df_low.event,
                    label='Low: n = ' + str(len(df_low)))
    except ValueError:
        return ("NA", "0", "0", "0", "0")
    kmf_high.plot(ax=ax, color="red", show_censors=True, ci_show=False)
    kmf_low.plot(ax=ax, color="black", show_censors=True, ci_show=False)
    statistics_result = logrank_test(df_high.duration, df_low.duration,
                                     event_observed_A=df_high.event,
                                     event_observed_B=df_low.event)
    p_value = statistics_result.p_value
    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Probability')
    ax.text(0.95, 0.02, 'logrank P = ' + str('%.4f' % p_value),
            verticalalignment='bottom', horizontalalignment='right',
            transform=ax.transAxes, color='black', fontsize=11)
    plt.legend(loc=3)
    # survival probabilities at 5 and 10 years (60 and 120 months)
    hm5 = kmf_high.predict(60)
    hm10 = kmf_high.predict(120)
    lm5 = kmf_low.predict(60)
    lm10 = kmf_low.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
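# A minimal usage sketch for kmplot above, assuming a hypothetical DataFrame
# with `duration`/`event` columns plus an illustrative `expression` column used
# to split subjects at the median; all toy values here are assumptions, not
# from the original source.
import matplotlib.pyplot as plt
import pandas as pd

df = pd.DataFrame({
    'duration': [5, 12, 30, 44, 60, 71, 90, 120],       # months
    'event':    [1, 1, 0, 1, 0, 1, 1, 0],               # 1 = event observed
    'expression': [2.1, 0.3, 1.8, 0.9, 2.5, 0.4, 1.1, 2.9],
})
median_expr = df['expression'].median()
fig, ax = plt.subplots()
p, hm5, hm10, lm5, lm10 = kmplot(df[df.expression >= median_expr],
                                 df[df.expression < median_expr], ax)
print(p, hm5, hm10, lm5, lm10)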
def KM_estimate(self):
    kmf = KaplanMeierFitter()
    T = self.T
    # np.bool was removed in NumPy >= 1.24; the builtin bool is equivalent here
    kmf.fit(self.x, self.δ.astype(bool), alpha=self.confidence_α, timeline=T)
    Survival = np.array(kmf.predict(T))
    self.KM = S_fun(self.x, self.M, T, Survival)
    self.KM.kmf = kmf
    self.KM.Sfun = self.KM.kmf.predict
    # mean survival time: rectangle rule over the survival curve
    self.KM.mean = np.sum([(T[nn + 1] - T[nn]) * Survival[nn]
                           for nn in range(len(Survival) - 1)]).astype(float) + T[0]
    self.KM.σ = np.array(self.KM.kmf.survival_function_.std())[0].astype(float)
    self.KM.mean_σ = self.KM.σ
    self.KM.CI = np.array(self.KM.kmf.confidence_interval_)
    percents = np.array([self.percentile(self.KM.Sfun, T, q / 100.)
                         for q in σ_interval])
    # older lifelines exposed this as `median_`
    self.KM.median = self.KM.kmf.median_survival_time_
    self.KM.median_σ = 0.5 * np.diff(percents[1:])[0]
    self.current = 'KM'
def marginal(self):
    # reverse Kaplan-Meier: flip the status indicator so that censoring
    # becomes the "event", yielding the censoring distribution G(t)
    self.data['status'] = self.data['status'].values.astype(int) ^ 1
    # weights at requested times
    if "IPCW.times" in self.what:
        kmf = KaplanMeierFitter()
        kmf.fit(self.data['failure_time'],
                event_observed=self.data['status'].values,
                timeline=self.times)
        self.weights = np.round(kmf.predict(self.times), decimals=4)
        # self.weights = kmf.conditional_time_to_event_(self.times)
        # R equivalent of this step:
        # times = predict(fit, newdata=data, times=times, level_chaos=1,
        #                 mode="matrix", type="surv")
        self.times = []
    else:
        self.times = None
    # weights at subject-specific event times
    if "IPCW.subject.times" in self.what:
        # R equivalent: prodlim.predictSurvIndividual(fit, lag=self.lag)
        self.subject_times = []
    else:
        self.subject_times = None
    out = {
        'times': self.times,
        'subject_times': self.subject_times,
        'method': self.method
    }
    out = self.output(out, self.keep, self.times, self.fit, self.call)
    # class(out) <- "IPCW"  (residue of the R original this was ported from)
    return self.weights
def split(self, X, delta, y, cat=[], mode='train', df=[]):
    min_dur, max_dur = y.min(), y.max()
    times = np.linspace(min_dur, max_dur, 100)
    kmf = KaplanMeierFitter()
    # fit on the censoring indicator (1 - delta) to get the censoring distribution
    kmf.fit(y, 1 - delta)
    s_kmf = kmf.predict(y.squeeze()).values
    t_kmf = kmf.predict(times).values
    setattr(self, f'{mode}_times',
            torch.from_numpy(times.astype('float32')).float().unsqueeze(-1))
    setattr(self, f'{mode}_s_kmf',
            torch.from_numpy(s_kmf.astype('float32')).float().unsqueeze(-1))
    setattr(self, f'{mode}_t_kmf',
            torch.from_numpy(t_kmf.astype('float32')).float().unsqueeze(-1))
    setattr(self, f'{mode}_delta',
            torch.from_numpy(delta.astype('float32').values).float())
    setattr(self, f'{mode}_y', torch.from_numpy(y).float())
    setattr(self, f'{mode}_X', torch.from_numpy(X).float())
    if self.cat_cols:
        setattr(self, f'{mode}_cat_X',
                torch.from_numpy(df[cat].astype('int64').values).long())
def fun(epsilon):
    li = []
    for kk in range(100):
        newdata_ = laplace_mechanism(his, np.sqrt(2.0) / epsilon)
        # clip negative noisy counts at zero and cast to int so the counts
        # can be used as array sizes below
        newdata = [int(round(max(0.0, d))) for d in newdata_]
        ntime = np.asarray([])
        nevent = np.asarray([])
        # rebuild censored observations from the noisy histogram
        for i in range(bins0):
            ntime = np.append(ntime, np.linspace(bin_edges0[i], bin_edges0[i + 1],
                                                 newdata[i]))
            nevent = np.append(nevent, np.zeros(newdata[i]))
        # rebuild observed events from the noisy histogram
        for i in range(bins1):
            ntime = np.append(ntime, np.linspace(bin_edges1[i], bin_edges1[i + 1],
                                                 newdata[bins0 + i]))
            nevent = np.append(nevent, np.ones(newdata[bins0 + i]))
        kmf1 = KaplanMeierFitter()
        kmf1.fit(ntime, event_observed=nevent)
        out = kmf1.predict(kmf.timeline)
        # mean relative error against the non-private KM curve
        mre = np.linalg.norm(out - true_value[:, 0]) / np.linalg.norm(true_value[:, 0])
        li.append(mre)
    avg = np.average(li)
    print("(%f, %f)" % (epsilon, avg))
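# The snippet above calls laplace_mechanism without defining it. A plausible
# sketch under the standard definition of the Laplace mechanism (add i.i.d.
# Laplace noise with the given scale to each histogram count); the name and
# signature come from the call site, the body is an assumption.
import numpy as np

def laplace_mechanism(counts, scale):
    """Return counts perturbed with i.i.d. Laplace(0, scale) noise."""
    counts = np.asarray(counts, dtype=float)
    return counts + np.random.laplace(loc=0.0, scale=scale, size=counts.shape)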
def kmplot(df_high, df_low):
    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(durations=df_high.duration, event_observed=df_high.event,
                     label='High: n = ' + str(len(df_high)))
        kmf_low.fit(durations=df_low.duration, event_observed=df_low.event,
                    label='Low: n = ' + str(len(df_low)))
    except ValueError:
        return ("NA", "0", "0", "0", "0")
    statistics_result = logrank_test(df_high.duration, df_low.duration,
                                     event_observed_A=df_high.event,
                                     event_observed_B=df_low.event)
    p_value = statistics_result.p_value
    hm5 = kmf_high.predict(60)
    hm10 = kmf_high.predict(120)
    lm5 = kmf_low.predict(60)
    lm10 = kmf_low.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
def get_censoring_dist(train_dataset):
    _dataset = train_dataset.dataset
    times = [d['time_at_event'] for d in _dataset]
    event_observed = [d['y'] for d in _dataset]
    all_observed_times = set(times)
    # fit a KM estimator and evaluate it at every distinct event time
    kmf = KaplanMeierFitter()
    kmf.fit(times, event_observed)
    censoring_dist = {time: kmf.predict(time) for time in all_observed_times}
    return censoring_dist
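# A toy usage sketch for get_censoring_dist, assuming train_dataset is a
# torch.utils.data.Subset-like object whose .dataset is a list of dicts with
# 'time_at_event' and 'y' keys (the structure implied by the function body);
# the wrapper class here is illustrative, not from the original source.
class _FakeSubset:
    def __init__(self, dataset):
        self.dataset = dataset

toy = _FakeSubset([
    {'time_at_event': 3, 'y': 1},
    {'time_at_event': 5, 'y': 0},
    {'time_at_event': 5, 'y': 1},
    {'time_at_event': 9, 'y': 0},
])
print(get_censoring_dist(toy))  # maps each distinct time to its KM estimate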
def predict(self, test_x, time_list):
    """
    For each test data point, find the k nearest neighbors, use them to fit
    a Kaplan-Meier model to get the survival function, and then use the
    survival function to calculate the median survival time.

    :param test_x: array-like of test features
    :param time_list: checkpoint times to calculate probabilities on
    :return: the list of median survival times and the probability matrix
    """
    test_df = pd.DataFrame(data=test_x, columns=self._feature_names)
    test_df = self._standardize_df(test_df, flag='test')
    reduced_test_df = self._test_pca(test_df)
    test_x = reduced_test_df.values

    # calculate the distance matrix to find the nearest neighbors
    distance_matrix, neighbor_matrix = self.neighbors.kneighbors(
        X=test_x,
        n_neighbors=int(np.min([self.n_neighbors, self.train_points]))
    )

    proba_matrix = []
    test_time_median_pred = []
    for test_idx, test_point in enumerate(test_x):
        # find the k nearest neighbors
        neighbor_train_y = self.train_df.iloc[neighbor_matrix[test_idx]][
            [self._duration_col, self._event_col]
        ]
        kmf = KaplanMeierFitter()
        kmf.fit(neighbor_train_y[self._duration_col],
                neighbor_train_y[self._event_col])
        survival_proba = kmf.predict(time_list)

        # calculate the median survival time: the time at which the survival
        # probability equals 0.5. survival_proba descends from 1 to 0, so we
        # only need the first probability that is <= 0.5.
        median_time = np.max(time_list)
        for col, proba in enumerate(survival_proba):
            if proba > 0.5:
                continue
            if proba == 0.5:
                median_time = time_list[col]
            else:
                # take the average of the times before and after Pr = 0.5
                median_time = (time_list[col - 1] + time_list[col]) / 2
            break

        test_time_median_pred.append(median_time)
        proba_matrix.append(survival_proba)

    return np.array(test_time_median_pred), \
        pd.DataFrame(np.transpose(np.array(proba_matrix)),
                     index=np.array(time_list))
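# For reference, lifelines can produce the median directly from a fitted
# KaplanMeierFitter. A standalone sketch with assumed toy duration/event
# arrays; median_survival_times is a real lifelines.utils helper.
from lifelines import KaplanMeierFitter
from lifelines.utils import median_survival_times

kmf = KaplanMeierFitter()
kmf.fit([3, 5, 7, 11, 14, 20, 22, 30], [1, 1, 0, 1, 1, 0, 1, 1])
print(kmf.median_survival_time_)                        # median survival time
print(median_survival_times(kmf.confidence_interval_))  # CI for the median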
def binomial_log_likelihood_km(times, prob_alive, durations, events, eps=1e-7):
    '''Compute the binomial log-likelihood for survival at given times.

    We compute the binomial log-likelihood weighted by the inverse censoring
    distribution. This is the same weighting scheme as for the Brier score.

    Parameters:
        times: Number or iterable with times where to compute the scores.
        prob_alive: Numpy array [len(times), len(durations)] with the estimated
            probabilities of each individual to be alive at each time in
            `times`. Each row represents a time in the input array `times`.
        durations: Numpy array with times of events.
        events: Boolean numpy array indicating dead/censored (True/False).
        eps: Clip prob_alive at (eps, 1-eps).

    Returns:
        Numpy array with binomial log-likelihood scores.
    '''
    from lifelines import KaplanMeierFitter
    if not hasattr(times, '__iter__'):
        times = [times]
    assert prob_alive.__class__ is np.ndarray, 'Need numpy array'
    assert prob_alive.shape == (len(times), len(durations)),\
        'Need prob_alive to have dims [len(times), len(durations)].'

    # KM estimate of the censoring distribution (event indicators flipped)
    kmf_censor = KaplanMeierFitter()
    kmf_censor.fit(durations, 1 - events)
    km_censor_at_durations = kmf_censor.survival_function_.loc[durations].values.flatten()
    km_censor_at_times = kmf_censor.predict(times)

    prob_alive = np.clip(prob_alive, eps, 1 - eps)

    def compute_score(time_, km_censor_at_time, prob_alive_):
        died = ((durations <= time_) & (events == True))
        survived = (durations > time_)
        event_part = np.log(1 - prob_alive_[died]) / km_censor_at_durations[died]
        survived_part = np.log(prob_alive_[survived]) / km_censor_at_time
        return (np.sum(event_part) + np.sum(survived_part)) / len(durations)

    scores = [compute_score(time_, km, pa)
              for time_, km, pa in zip(times, km_censor_at_times, prob_alive)]
    return np.array(scores)
def brier_score_km(times, prob_alive, durations, events):
    '''Compute the Brier scores (for survival) at given times.

    For a specification of Brier scores for survival data see e.g.:
    "Assessment of evaluation criteria for survival prediction from
    genomic data" by Bovelstad and Borgan.

    Parameters:
        times: Number or iterable with times where to compute the brier scores.
        prob_alive: Numpy array [len(times), len(durations)] with the estimated
            probabilities of each individual to be alive at each time in
            `times`. Each row represents a time in the input array `times`.
        durations: Numpy array with times of events.
        events: Boolean numpy array indicating dead/censored (True/False).

    Returns:
        Numpy array with brier scores.
    '''
    from lifelines import KaplanMeierFitter
    if not hasattr(times, '__iter__'):
        times = [times]
    assert prob_alive.__class__ is np.ndarray, 'Need numpy array'
    assert prob_alive.shape == (len(times), len(durations)),\
        'Need prob_alive to have dims [len(times), len(durations)].'

    # KM estimate of the censoring distribution (event indicators flipped)
    kmf_censor = KaplanMeierFitter()
    kmf_censor.fit(durations, 1 - events)
    # km_censor_at_durations = kmf_censor.predict(durations)
    km_censor_at_durations = kmf_censor.survival_function_.loc[durations].values.flatten()
    km_censor_at_times = kmf_censor.predict(times)

    def compute_score(time_, km_censor_at_time, prob_alive_):
        died = ((durations <= time_) & (events == True))
        survived = (durations > time_)
        event_part = (prob_alive_**2)[died] / km_censor_at_durations[died]
        survived_part = ((1 - prob_alive_)**2)[survived] / km_censor_at_time
        return (np.sum(event_part) + np.sum(survived_part)) / len(durations)

    b_scores = [compute_score(time_, km, pa)
                for time_, km, pa in zip(times, km_censor_at_times, prob_alive)]
    return np.array(b_scores)
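# A toy usage sketch for brier_score_km / binomial_log_likelihood_km on
# assumed synthetic data: four subjects, two evaluation times, and a made-up
# prediction matrix. All names come from the two functions above.
import numpy as np

durations = np.array([2., 4., 6., 8.])
events = np.array([True, False, True, True])
times = [3., 5.]
# predicted P(alive) per (time, subject); shape [len(times), len(durations)]
prob_alive = np.array([[0.9, 0.8, 0.85, 0.95],
                       [0.6, 0.5, 0.70, 0.80]])
print(brier_score_km(times, prob_alive, durations, events))
print(binomial_log_likelihood_km(times, prob_alive, durations, events))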
# Brier scores per stratum, using a Kaplan-Meier curve fitted on the training
# data as the baseline prediction for every test subject
kaplan_brier_scores = []
kaplan_group_sizes = []
kmf = KaplanMeierFitter()
kmf.fit(
    event_time_train,
    event_observed=event_indicator_train,
    timeline=eval_times_brier_score,
)
for s in np.unique(strata):
    strata_train_dat = survival_data_train[strata_train == s]
    strata_test_dat = survival_data_test[strata_test == s]
    # the KM baseline predicts the same curve for every test subject
    kaplan_preds = np.repeat(
        [kmf.predict(eval_times_brier_score).to_numpy()],
        strata_test_dat.shape[0],
        axis=0,
    )
    times, km_score = brier_score(
        survival_train=strata_train_dat,
        survival_test=strata_test_dat,
        estimate=kaplan_preds,
        times=eval_times_brier_score,
    )
    kaplan_brier_scores.append(km_score)
    kaplan_group_sizes.append(strata_test_dat.shape[0])

# average the per-stratum scores, weighted by stratum size
kmf_brier_scores = np.average(np.stack(kaplan_brier_scores),
                              axis=0, weights=kaplan_group_sizes)
Female.head()

# Instantiate one fitter per group (done earlier in the original script)
kmf_m = KaplanMeierFitter()
kmf_f = KaplanMeierFitter()

# Fit data into objects:
kmf_m.fit(durations=Male["time"], event_observed=Male["dead"], label="Male")
kmf_f.fit(durations=Female["time"], event_observed=Female["dead"], label="Female")

# Event table for male group:
kmf_m.event_table
# Event table for female group:
kmf_f.event_table

# Predict survival probability at a given time:
kmf_m.predict(11)
kmf_f.predict(11)

# Get complete data of the survival function for each group:
kmf_m.survival_function_
kmf_f.survival_function_

# Plot the survival_function data:
kmf_m.plot()
kmf_f.plot()
plt.xlabel("Days Passed")
plt.ylabel("Survival Probability")
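# To quantify the male/female difference seen in the plot, a short sketch
# using lifelines' logrank_test (a real lifelines function) on the Male and
# Female DataFrames fitted above.
from lifelines.statistics import logrank_test

res = logrank_test(Male["time"], Female["time"],
                   event_observed_A=Male["dead"],
                   event_observed_B=Female["dead"])
res.print_summary()  # prints the test statistic and p-value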
def predictNextDeal(targetGameDetails, publisherGameDetails, genreGameDetails,
                    filledHistories, validate=False):
    #--------------------------------------------------------------------------
    # parse inputs for needed data formats
    #--------------------------------------------------------------------------
    # get target game information
    ind = targetGameDetails.index.tolist()
    targetReleaseDate = targetGameDetails['Release Data'][ind[0]]
    # get price history information
    targetHistory = filledHistories[0]
    publisherHistory = filledHistories[1]
    genreHistory = filledHistories[2]

    #--------------------------------------------------------------------------
    # feature engineering: target price history
    # 1) transform dates to days since release
    # 2) find times when deals occur and the duration from the last deal's end
    #    to the current deal's start
    #--------------------------------------------------------------------------
    target_dur_until_deal = []
    targetDaysSinceRelease = []
    if len(targetHistory) > 0:
        # Part 1 - transform dates to days since release
        # convert timestamps to dates
        targetGameDates = []
        for ts in targetHistory['timestamps']:
            targetGameDates.append(datetime.datetime.fromtimestamp(ts).date())
        # transform specific dates to time deltas (in days) from the release
        # date, i.e. days from release_date to timepoint1, timepoint2, etc.
        try:
            targetfirstdayDT = datetime.datetime.strptime(targetReleaseDate, '%b %d, %Y').date()
        except ValueError:
            try:
                targetfirstdayDT = datetime.datetime.strptime(targetReleaseDate, '%b %Y').date()
            except ValueError:
                targetfirstdayDT = parse(targetReleaseDate, ignoretz=True).date()
        for i in range(len(targetGameDates)):
            delta = targetGameDates[i] - targetfirstdayDT
            targetDaysSinceRelease.append(delta.days)

        # optional visualization (commented out in the original):
        # plt.scatter(targetDaysSinceRelease, targetHistory.iloc[:, 1])

        # Part 2 - find times when deals occur and the duration from the last
        # deal's end: find changes in the target price history time series
        targetPriceDiff = targetHistory.iloc[:, 1].diff()
        # combine days since release and price changes into a matrix
        targetPriceChanges = np.zeros([len(targetPriceDiff), 2])
        for i in range(len(targetPriceChanges)):
            tempDate = targetDaysSinceRelease[i]
            tempPrice = targetPriceDiff.iloc[i]
            targetPriceChanges[i, :] = [tempDate, tempPrice]
        # remove the first row (NaN from diff)
        priceChanges = targetPriceChanges[1:, :]
        # remove rows with zeros (no price change)
        priceChanges = priceChanges[~(priceChanges == 0).any(1), :]
        # price drops mark when deals start, other changes mark when they end
        targetDealStart = priceChanges[(priceChanges < 0).any(1), :]
        targetDealEnd = priceChanges[~(priceChanges < 0).any(1), :]
        firstRow = targetPriceChanges[0, :]
        targetDealEnd = np.vstack([firstRow, targetDealEnd])
        # find the duration between deals
        for i in range(len(targetDealStart[:, 0])):
            # find the index of the day for the current deal
            dealDay = targetDealStart[i, 0]
            daysColumn = targetPriceChanges[:, 0]
            dealDayInd = np.where(daysColumn == dealDay)[0][0]
            # find the day when the last normal price began
            temp = targetPriceChanges[:int(dealDayInd), :]
            previousNonZero = temp[(temp[:, 1] != 0), :]
            originDay = previousNonZero[-1][0]
            # time elapsed since the last deal ended
            duration = dealDay - originDay
            target_dur_until_deal = np.hstack([target_dur_until_deal, duration])
        # remove the first row with the zero initialization
        target_dur_until_deal = target_dur_until_deal[1:]

    #--------------------------------------------------------------------------
    # feature engineering: publisher price history
    # (same two steps as for the target game)
    #--------------------------------------------------------------------------
    # Part 1 - transform dates to days since release
    publisherDaysSinceRelease = []
    # make sure there are other publisher games
    if len(publisherHistory) > 0:
        publisherGameTitles = []
        for ind in range(len(publisherHistory)):
            dealDates = []
            for ts in publisherHistory[ind]['timestamps']:
                dealDates.append(datetime.datetime.fromtimestamp(ts).date())
            # get the first day (release date) for this game
            columnnames = list(publisherHistory[ind].columns.values)
            ind2 = publisherGameDetails.loc[
                publisherGameDetails['Game Title'] == columnnames[1]].index.tolist()
            firstday = publisherGameDetails['Release Data'][ind2[0]]
            try:
                firstdayDT = datetime.datetime.strptime(firstday, '%b %d, %Y').date()
            except ValueError:
                try:
                    firstdayDT = datetime.datetime.strptime(firstday, '%d %b, %Y').date()
                except ValueError:
                    try:
                        firstdayDT = datetime.datetime.strptime(firstday, '%B %dth, %Y').date()
                    except ValueError:
                        try:
                            firstdayDT = datetime.datetime.strptime(firstday, '%b %Y').date()
                        except ValueError:
                            firstdayDT = parse(firstday, ignoretz=True).date()
            # calculate time deltas from the release date
            gameDelta = []
            for i in range(len(dealDates)):
                timeDelta = dealDates[i] - firstdayDT
                gameDelta.append(timeDelta.days)
            publisherGameTitles.append(columnnames[1])
            publisherDaysSinceRelease.append(gameDelta)

    # optional visualization of publisher price histories vs. days since
    # release (commented out in the original)

    # Part 2 - find times when deals occur and the duration from the last
    # deal's end
    publisher_dur_until_deal = []
    if len(publisherHistory) > 0:
        for ind in range(len(publisherHistory)):
            # find changes in the publisher price history time series
            publisherPriceDiff = publisherHistory[ind].iloc[:, 1].diff()
            # combine days since release and price changes into a matrix
            publisherPriceChanges = np.zeros([len(publisherPriceDiff), 2])
            for i in range(len(publisherPriceChanges)):
                tempDate = publisherDaysSinceRelease[ind][i]
                tempPrice = publisherPriceDiff.iloc[i]
                publisherPriceChanges[i, :] = [tempDate, tempPrice]
            # remove the first row (NaN from diff)
            priceChanges = publisherPriceChanges[1:, :]
            # remove rows with zeros
            priceChanges = priceChanges[~(priceChanges == 0).any(1), :]
            # markers for when deals start / end
            publisherDealStart = priceChanges[(priceChanges < 0).any(1), :]
            publisherDealEnd = priceChanges[~(priceChanges < 0).any(1), :]
            firstRow = publisherPriceChanges[0, :]
            publisherDealEnd = np.vstack([firstRow, publisherDealEnd])
            # find the duration between deals
            temp_dur_until_deal = np.zeros(1)
            for i in range(len(publisherDealStart[:, 0])):
                # find the index of the day for the current deal
                dealDay = publisherDealStart[i, 0]
                daysColumn = publisherPriceChanges[:, 0]
                dealDayInd = np.where(daysColumn == dealDay)[0][0]
                # find the day when the last normal price began
                temp = publisherPriceChanges[:int(dealDayInd), :]
                previousNonZero = temp[(temp[:, 1] != 0), :]
                originDay = previousNonZero[-1][0]
                # time elapsed since the last deal ended
                duration = dealDay - originDay
                temp_dur_until_deal = np.hstack([temp_dur_until_deal, duration])
            # remove the first row with the zero initialization
            temp_dur_until_deal = temp_dur_until_deal[1:]
            # stack publisher durations in a matrix
            publisher_dur_until_deal.append(temp_dur_until_deal)

    #--------------------------------------------------------------------------
    # feature engineering: genre price history
    # (same two steps; only used when no publisher games are available)
    #--------------------------------------------------------------------------
    # Part 1 - transform dates to days since release
    genreDaysSinceRelease = []
    if len(genreHistory) > 0 and len(publisherHistory) < 1:
        genreGameTitles = []
        for ind in range(len(genreHistory)):
            dealDates = []
            for ts in genreHistory[ind]['timestamps']:
                dealDates.append(datetime.datetime.fromtimestamp(ts).date())
            # get the first day (release date) for this game
            columnnames = list(genreHistory[ind].columns.values)
            ind2 = genreGameDetails.loc[
                genreGameDetails['Game Title'] == columnnames[1]].index.tolist()
            firstday = genreGameDetails['Release Data'][ind2[0]]
            try:
                firstdayDT = datetime.datetime.strptime(firstday, '%b %d, %Y').date()
            except ValueError:
                try:
                    firstdayDT = datetime.datetime.strptime(firstday, '%d %b, %Y').date()
                except ValueError:
                    try:
                        firstdayDT = datetime.datetime.strptime(firstday, '%B %dth, %Y').date()
                    except ValueError:
                        try:
                            firstdayDT = datetime.datetime.strptime(firstday, '%b %Y').date()
                        except ValueError:
                            firstdayDT = parse(firstday, ignoretz=True).date()
            # calculate time deltas from the release date
            gameDelta = []
            for i in range(len(dealDates)):
                timeDelta = dealDates[i] - firstdayDT
                gameDelta.append(timeDelta.days)
            genreGameTitles.append(columnnames[1])
            genreDaysSinceRelease.append(gameDelta)

    # Part 2 - find times when deals occur and the duration from the last
    # deal's end
    genre_dur_until_deal = []
    if len(genreHistory) > 0 and len(publisherHistory) < 1:
        for ind in range(len(genreHistory)):
            # find changes in the genre price history time series
            genrePriceDiff = genreHistory[ind].iloc[:, 1].diff()
            # combine days since release and price changes into a matrix
            genrePriceChanges = np.zeros([len(genrePriceDiff), 2])
            for i in range(len(genrePriceChanges)):
                tempDate = genreDaysSinceRelease[ind][i]
                tempPrice = genrePriceDiff.iloc[i]
                genrePriceChanges[i, :] = [tempDate, tempPrice]
            # remove the first row (NaN from diff)
            priceChanges = genrePriceChanges[1:, :]
            # remove rows with zeros
            priceChanges = priceChanges[~(priceChanges == 0).any(1), :]
            # markers for when deals start / end
            genreDealStart = priceChanges[(priceChanges < 0).any(1), :]
            genreDealEnd = priceChanges[~(priceChanges < 0).any(1), :]
            firstRow = genrePriceChanges[0, :]
            genreDealEnd = np.vstack([firstRow, genreDealEnd])
            # find the duration between deals
            temp_dur_until_deal = np.zeros(1)
            for i in range(len(genreDealStart[:, 0])):
                dealDay = genreDealStart[i, 0]
                daysColumn = genrePriceChanges[:, 0]
                dealDayInd = np.where(daysColumn == dealDay)[0][0]
                temp = genrePriceChanges[:int(dealDayInd), :]
                previousNonZero = temp[(temp[:, 1] != 0), :]
                originDay = previousNonZero[-1][0]
                duration = dealDay - originDay
                temp_dur_until_deal = np.hstack([temp_dur_until_deal, duration])
            # remove the first row with the zero initialization
            temp_dur_until_deal = temp_dur_until_deal[1:]
            # stack genre durations in a matrix
            genre_dur_until_deal.append(temp_dur_until_deal)

    #--------------------------------------------------------------------------
    # machine learning: probability of a deal
    #--------------------------------------------------------------------------
    if validate is True and len(targetHistory) > 0:
        if len(target_dur_until_deal) > 1:
            # choose a random target deal that has already occurred:
            # testDuration = random.choice(target_dur_until_deal)
            # choose the last target deal that has already occurred
            testDuration = target_dur_until_deal[-1]
            # get its row index and remove it (and later prices) from training
            ind = np.where(target_dur_until_deal == testDuration)[0][0]
            target_dur_until_deal = np.delete(target_dur_until_deal, np.s_[ind])
        else:
            testDuration = np.NaN
    else:
        testDuration = np.NaN

    # combine all data points into a single array
    all_dur_until_deal = np.zeros(1)  # initialize array
    if len(target_dur_until_deal) > 0:
        all_dur_until_deal = target_dur_until_deal
    if len(publisher_dur_until_deal) > 0:
        for ind in range(len(publisher_dur_until_deal)):
            all_dur_until_deal = np.hstack((all_dur_until_deal,
                                            publisher_dur_until_deal[ind]))
    if len(genre_dur_until_deal) > 0:
        for ind in range(len(genre_dur_until_deal)):
            all_dur_until_deal = np.hstack((all_dur_until_deal,
                                            genre_dur_until_deal[ind]))

    # every collected duration ended in a deal, so its event indicator is 1
    all_event = np.ones(all_dur_until_deal.shape)

    # days since the last target deal ended: a right-censored observation
    if len(targetDaysSinceRelease) > 0:
        censoredNextDealDur = targetDaysSinceRelease[-1] - targetDealEnd[-1][0]
    else:
        censoredNextDealDur = 0

    # add days since the last target deal to the duration and event arrays
    all_dur_until_deal = np.append(all_dur_until_deal, censoredNextDealDur)
    all_event = np.append(all_event, 0)

    # transform to a pandas DataFrame for fitting
    tempmat = np.vstack((all_dur_until_deal, all_event)).T
    columns = ['Duration', 'Event']
    duration = pd.DataFrame(tempmat, columns=columns)

    # estimate the survival function using the Kaplan-Meier estimator
    kmf = KaplanMeierFitter()
    kmf.fit(duration['Duration'], event_observed=duration['Event'])

    # optional visualization of the survival function (commented out in the
    # original)

    # probability a deal will occur tomorrow
    probDealTomorrow = 1 - kmf.predict(censoredNextDealDur)

    # first day at which the survival probability drops below 0.10, i.e. a
    # deal is more than 90% likely by that day
    try:
        kmf_survival = kmf.survival_function_
        threshold = kmf_survival[kmf_survival['KM_estimate'] < .10]
        nextHighProbDay = threshold.index[0]
        nextHighProbDay = nextHighProbDay - censoredNextDealDur
    except IndexError:
        nextHighProbDay = np.NaN

    # Cox proportional hazards model (regression model for the future):
    # cph = CoxPHFitter()
    # cph.fit(duration, duration_col=0, show_progress=True)
    # cph.print_summary()
    # cph.predict_survival_function(duration, times=[5., 25., 50.])
    # cph.predict_median(duration)

    return list([probDealTomorrow, nextHighProbDay, testDuration])
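# The core trick of predictNextDeal in miniature: fit a KM curve on the gap
# lengths between past deals, append the current (still running, hence
# censored) gap, and read 1 - S(t) at the current gap length. The toy numbers
# are assumptions for illustration.
from lifelines import KaplanMeierFitter

gaps = [30, 45, 60, 28, 52]        # days between past deals (events)
current_gap = 40                   # days since the last deal ended (censored)
kmf = KaplanMeierFitter()
kmf.fit(gaps + [current_gap], event_observed=[1, 1, 1, 1, 1, 0])
print(1 - kmf.predict(current_gap))  # probability a deal is due by now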
# calculate the survival probability for t = 0 from the event table
event_at_0 = kmf.event_table.iloc[0, :]
surv_for_0 = (event_at_0.at_risk - event_at_0.observed) / event_at_0.at_risk
# calculate the survival probability for t = 1
event_at_1 = kmf.event_table.iloc[1, :]
surv_for_1 = (event_at_1.at_risk - event_at_1.observed) / event_at_1.at_risk
# calculate the survival probability for t = 2
event_at_2 = kmf.event_table.iloc[2, :]
surv_for_2 = (event_at_2.at_risk - event_at_2.observed) / event_at_2.at_risk

# the probability that an NFL player has a career longer than 2 years is the
# product of the conditional survival probabilities, which is what
# kmf.predict(2) returns
surv_after_2 = surv_for_0 * surv_for_1 * surv_for_2
kmf.predict(2)

# the survival probabilities of NFL players after 1, 3, 5, and 10 yrs played
kmf.predict([1, 3, 5, 10])

kmf.survival_function_
# median survival time (called `median_` in older lifelines versions)
kmf.median_survival_time_

# plot the KM estimate
kmf.plot()
# add title and y-axis label
plt.title("The Kaplan-Meier Estimate for Drafted NFL Players\n(1967-2015)")
plt.ylabel("Probability a Player is Still Active")
plt.show()
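# Sanity check (a sketch, assuming kmf was fitted on the NFL career data and
# that the first three event-table rows correspond to t = 0, 1, 2): the
# hand-computed product of conditional factors should match kmf.predict(2).
import numpy as np
print(np.isclose(surv_after_2, kmf.predict(2)))  # expected: True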
print(kmf_train.median_survival_time_)
print(kmf_test.median_survival_time_)

print(median_survival_times(kmf_train.confidence_interval_))
print(median_survival_times(kmf_test.confidence_interval_))

print(kmf_train.event_table)
print(kmf_test.event_table)

print('Survival probability for t=60 for train set: ', kmf_train.predict(60))
print('Survival probability for t=60 for test set: ', kmf_test.predict(60))

results = logrank_test(train['PFS'], test['PFS'],
                       train['disease_progress'], test['disease_progress'],
                       alpha=.95)
results.print_summary()
#Calculating the actual survival probability at a given time:
surv_after_0 = surv_for_0
print("Survival Probability After 0 Days: ", surv_after_0)

#Calculating the actual survival probability at a given time:
surv_after_5 = surv_for_0 * surv_for_5
print("Survival Probability After 5 Days: ", surv_after_5)

#Calculating the actual survival probability at a given time:
surv_after_11 = surv_for_0 * surv_for_5 * surv_for_11
print("Survival Probability After 11 Days: ", surv_after_11)

#Get the probability values the easy way!
print("Survival probability for t=0: ", kmf.predict(0))
print("Survival probability for t=5: ", kmf.predict(5))
print("Survival probability for t=11: ", kmf.predict(11))

#Predicting the survival probability for an array of values:
kmf.predict([0, 5, 11, 12])

#To get the full list:
kmf.survival_function_

#Plot the graph:
kmf.plot()
plt.title("The Kaplan-Meier Estimate")
plt.xlabel("Number of days")
plt.ylabel("Probability of survival")
def _calibration_curve_ipcw(out, e, t, a, group, eval_time, typ,
                            ret_bins=True, strat='quantile', n_bins=10):
    """Returns the calibration curve and the bins given some risk scores.

    Accepts the output of a trained survival model at a certain evaluation
    time, the event indicators and protected group membership, and outputs
    an IPCW-adjusted calibration curve.

    Args:
      out: risk scores P(T>t) issued by a trained survival analysis model
        (output of fair_survival_analysis.models.predict_survival).
      e: a numpy vector of indicators specifying if an event or censoring
        occurred.
      t: a numpy vector of times at which the events or censoring occurred.
      a: a numpy vector of protected attributes.
      group: string indicating the demographic to evaluate calibration for.
      eval_time: float/int of the event time at which calibration is to be
        evaluated. Must be the same time at which the risk scores were issued.
      typ: determines if the calibration curves are computed on the
        individuals that experienced the event, or adjusted for censored
        individuals with the IPCW estimator at a population ('IPCWpop') or
        subgroup level.
      ret_bins: boolean that specifies if the bins of the calibration curve
        are to be returned.
      strat: specifies how the bins are computed. One of:
        "quantile": equal-sized bins.
        "uniform": equal-width bins.
      n_bins: int specifying the number of bins used to compute the ECE.

    Returns:
      Calibration curve: a tuple of true probability and estimated
      probability in each bin, and the estimated expected calibration error.
    """
    if typ == 'IPCWpop':
        # censoring distribution estimated on the whole population
        kmf = KaplanMeierFitter().fit(t, 1 - e)
    else:
        # censoring distribution estimated on the subgroup only
        t_ = t[a == group]
        e_ = e[a == group]
        kmf = KaplanMeierFitter().fit(t_, 1 - e_)

    out_ = out.copy()
    e = e[a == group]
    t = t[a == group]
    out = out[a == group]

    y = t > eval_time

    if strat == 'quantile':
        quantiles = [(1. / n_bins) * i for i in range(n_bins + 1)]
        outbins = np.quantile(out, quantiles)
    if strat == 'uniform':
        binlen = (out.max() - out.min()) / n_bins
        outbins = [out.min() + i * binlen for i in range(n_bins + 1)]

    prob_true = []
    prob_pred = []
    ece = 0
    for n_bin in range(n_bins):
        binmin = outbins[n_bin]
        binmax = outbins[n_bin + 1]
        scorebin = (out >= binmin) & (out <= binmax)
        # fraction of samples falling in this bin (the original used
        # len(scorebin), which is the full sample size; sum() counts the
        # bin membership)
        weight = float(scorebin.sum()) / len(out)
        out_ = out[scorebin]
        y_ = y[scorebin]
        # IPCW adjustment: reweight survival indicators by the inverse of
        # the censoring survival probability at the evaluation time
        y_ = y_ / kmf.predict(eval_time)
        pred = y_.mean()
        prob_true.append(pred)
        prob_pred.append(out_.mean())
        gap = abs(prob_pred[-1] - prob_true[-1])
        ece += weight * gap

    if ret_bins:
        return prob_true, prob_pred, outbins, ece
    else:
        return prob_true, prob_pred, ece
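# A toy usage sketch for _calibration_curve_ipcw on assumed synthetic inputs;
# the group labels, evaluation time, and random data are illustrative only.
import numpy as np

rng = np.random.default_rng(0)
n = 200
t = rng.exponential(10, n)                 # event/censoring times
e = rng.integers(0, 2, n)                  # 1 = event, 0 = censored
a = np.repeat(['grp1', 'grp2'], n // 2)    # protected attribute
out = rng.uniform(0, 1, n)                 # model risk scores P(T > 5)
prob_true, prob_pred, bins, ece = _calibration_curve_ipcw(
    out, e, t, a, group='grp1', eval_time=5, typ='IPCWpop')
print(ece)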
        else:
            cls = 'Nonresponder'
        month_dic[cls].append(surDic[pat]['months'])
        status_dic[cls].append(surDic[pat]['status'])

# logrank test between responders and nonresponders
results = logrank_test(month_dic['Responder'], month_dic['Nonresponder'],
                       event_observed_A=status_dic['Responder'],
                       event_observed_B=status_dic['Nonresponder'])
pvalue = results.p_value

# 5-year (60-month) survival probability per class
for cls in month_dic:
    kmf = KaplanMeierFitter()
    kmf.fit(month_dic[cls], status_dic[cls])
    fiveYear_dic[cls] = kmf.predict(60)

# draw the survival plot
f = plt.figure(figsize=(4, 4))
ax = f.add_subplot(1, 1, 1)
plt.title('%s / %s / %s / %s\npvalue=%.4f\n' % (cancer_type, drug, ML,
                                                testing_pathway_rank, pvalue),
          fontsize=8)
c1 = KaplanMeierFitter()
ax = c1.fit(month_dic['Responder'], status_dic['Responder'],
            label='Responder (n=%s)' % len(month_dic['Responder'])
            ).plot(ax=ax, ci_show=True, c='r')
######## Survival probability at t=0 only
event_at_0 = kmf.event_table.iloc[0, :]
survival_for_0 = (event_at_0.at_risk - event_at_0.observed) / event_at_0.at_risk
print("Survival probability at time 0 only is : ", survival_for_0)

######## Survival probability at t=5 only
event_at_5 = kmf.event_table.iloc[1, :]
survival_for_5 = (event_at_5.at_risk - event_at_5.observed) / event_at_5.at_risk
print("Survival probability at time 5 only is : ", survival_for_5)

######## Survival probability at t=13 only
event_at_13 = kmf.event_table.iloc[4, :]
survival_for_13 = (event_at_13.at_risk - event_at_13.observed) / event_at_13.at_risk
print("Survival probability at time 13 only is : ", survival_for_13)

##### Survival probability after 5 days (for t = 5)
survival_after_5 = survival_for_0 * survival_for_5
print("\nSurvival Probability after 5 days : ", survival_after_5)

#### Automate the work we've done above
print("\nSurvival Probability after 5 days : ", kmf.predict(5))
print("Survival Probability after 13 days : ", kmf.predict(13))
print("Survival Probability after 1022 days : ", kmf.predict(1022))

#### Survival probability for the whole timeline
print("\n", kmf.survival_function_)
def survival_difference_at_fixed_point_in_time_test(
        point_in_time, durations_A, durations_B,
        event_observed_A=None, event_observed_B=None, **kwargs) -> StatisticalResult:
    """
    Often analysts want to compare the survival-ness of groups at specific times, rather than
    comparing the entire survival curves against each other. For example, analysts may be
    interested in 5-year survival. Statistically comparing the naive Kaplan-Meier points at a
    specific time actually has reduced power (see [1]). By transforming the Kaplan-Meier curve,
    we can recover more power. This function uses the log(-log) transformation.

    Parameters
    ----------
    point_in_time: float
        the point in time to analyze the survival curves at.

    durations_A: iterable
        a (n,) list-like of event durations (birth to death,...) for the first population.

    durations_B: iterable
        a (n,) list-like of event durations (birth to death,...) for the second population.

    event_observed_A: iterable, optional
        a (n,) list-like of censorship flags, (1 if observed, 0 if not), for the first
        population. Default assumes all observed.

    event_observed_B: iterable, optional
        a (n,) list-like of censorship flags, (1 if observed, 0 if not), for the second
        population. Default assumes all observed.

    kwargs:
        add keywords and meta-data to the experiment summary

    Returns
    -------
    StatisticalResult
        a StatisticalResult object with properties ``p_value``, ``summary``,
        ``test_statistic``, ``print_summary``

    Examples
    --------
    .. code:: python

        T1 = [1, 4, 10, 12, 12, 3, 5.4]
        E1 = [1, 0, 1, 0, 1, 1, 1]

        T2 = [4, 5, 7, 11, 14, 20, 8, 8]
        E2 = [1, 1, 1, 1, 1, 1, 1, 1]

        from lifelines.statistics import survival_difference_at_fixed_point_in_time_test
        results = survival_difference_at_fixed_point_in_time_test(12, T1, T2, event_observed_A=E1, event_observed_B=E2)

        results.print_summary()
        print(results.p_value)        # 0.893
        print(results.test_statistic) # 0.017

    Notes
    -----
    Other transformations are possible, but Klein et al. [1] showed that the log(-log(c))
    transform has the most desirable statistical properties.

    References
    ----------
    [1] Klein, J. P., Logan, B., Harhoff, M. and Andersen, P. K. (2007), Analyzing survival
    curves at a fixed point in time. Statist. Med., 26: 4505-4519. doi:10.1002/sim.2864
    """
    kmfA = KaplanMeierFitter().fit(durations_A, event_observed=event_observed_A)
    kmfB = KaplanMeierFitter().fit(durations_B, event_observed=event_observed_B)

    sA_t = kmfA.predict(point_in_time)
    sB_t = kmfB.predict(point_in_time)

    # this is doing a prediction/interpolation between the kmf's index.
    sigma_sqA = interpolate_at_times_and_return_pandas(kmfA._cumulative_sq_, point_in_time)
    sigma_sqB = interpolate_at_times_and_return_pandas(kmfB._cumulative_sq_, point_in_time)

    log = np.log
    clog = lambda s: log(-log(s))

    X = (clog(sA_t) - clog(sB_t)) ** 2 / (sigma_sqA / log(sA_t) ** 2 + sigma_sqB / log(sB_t) ** 2)
    p_value = _chisq_test_p_value(X, 1)

    return StatisticalResult(
        p_value, X, null_distribution="chi squared", degrees_of_freedom=1,
        point_in_time=point_in_time,
        test_name="survival_difference_at_fixed_point_in_time_test", **kwargs)
)   ## group i2, the pandas Series for the 2nd cohort

## fit the model for the 1st cohort
kmf.fit(T[i1], E[i1], label='No Partner')
a1 = kmf.plot()
## fit the model for the 2nd cohort
kmf.fit(T[i2], E[i2], label='Partner')
kmf.plot(ax=a1)

#### 3 new cohorts are compared
# 1. Contract type is Month-to-month
# 2. Contract type is Two year
# 3. Contract type is One year
groups = input_df['Contract']
## create the cohorts from the 'Contract' column
ix1 = (groups == 'Month-to-month')  ## Cohort 1
ix2 = (groups == 'Two year')        ## Cohort 2
ix3 = (groups == 'One year')        ## Cohort 3

kmf.fit(T[ix1], E[ix1], label='Month-to-month')  ## fit the cohort 1 data
ax = kmf.plot()
kmf.fit(T[ix2], E[ix2], label='Two year')        ## fit the cohort 2 data
ax1 = kmf.plot(ax=ax)
kmf.fit(T[ix3], E[ix3], label='One year')        ## fit the cohort 3 data
kmf.plot(ax=ax1)  ## plot the KM curves for the three cohorts on the same axes

## survival probability at the first subject's duration
print(kmf.predict(T[0]))
plt.show()
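# To test all three contract cohorts at once, lifelines provides
# multivariate_logrank_test (a real lifelines function); a sketch assuming
# the T, E, and groups variables defined above.
from lifelines.statistics import multivariate_logrank_test

res = multivariate_logrank_test(T, groups, E)
res.print_summary()  # chi-squared statistic and p-value across all cohorts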