Example #1
def kmplot(df_high, df_low, ax):
    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(durations=df_high.duration,
                     event_observed=df_high.event,
                     label='High: n = ' + str(len(df_high)))
        kmf_low.fit(durations=df_low.duration,
                    event_observed=df_low.event,
                    label="Low: n = " + str(len(df_low)))
    except ValueError:
        return ("NA", "0", "0", "0", "0")
    kmf_high.plot(ax=ax, color="red", show_censors=True, ci_show=False)
    kmf_low.plot(ax=ax, color="black", show_censors=True, ci_show=False)
    statistics_result = logrank_test(df_high.duration,
                                     df_low.duration,
                                     event_observed_A=df_high.event,
                                     event_observed_B=df_low.event)
    p_value = statistics_result.p_value
    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Probability')
    ax.text(0.95,
            0.02,
            'logrank P = ' + str('%.4f' % p_value),
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes,
            color='black',
            fontsize=11)
    plt.legend(loc=3)
    hm5 = kmf_high.predict(60)
    hm10 = kmf_high.predict(120)
    lm5 = kmf_low.predict(60)
    lm10 = kmf_low.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
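A minimal sketch of how this helper might be invoked, assuming the usual lifelines/matplotlib imports and two toy DataFrames with the duration/event columns the function expects (the data and variable names below are illustrative, not from the original project):

import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test

# toy cohorts with the columns kmplot reads
df_high = pd.DataFrame({'duration': [5, 12, 30, 44, 60], 'event': [1, 1, 0, 1, 0]})
df_low = pd.DataFrame({'duration': [8, 20, 35, 50, 72], 'event': [1, 0, 1, 0, 1]})

fig, ax = plt.subplots()
p_value, hm5, hm10, lm5, lm10 = kmplot(df_high, df_low, ax)
plt.show()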
Example #2
    def KM_estimate(self):
        kmf = KaplanMeierFitter()
        T = self.T
        kmf.fit(self.x,
                self.δ.astype(bool),  # np.bool was removed from NumPy; use the builtin
                alpha=self.confidence_α,
                timeline=T)
        Survival = np.array(kmf.predict(T))

        self.KM = S_fun(self.x, self.M, T, Survival)
        self.KM.kmf = kmf
        self.KM.Sfun = self.KM.kmf.predict
        self.KM.mean = np.sum([(T[nn + 1] - T[nn]) * Survival[nn]
                               for nn in range(len(Survival) - 1)
                               ]).astype(float) + T[0]

        self.KM.σ = np.array(
            self.KM.kmf.survival_function_.std())[0].astype(float)
        self.KM.mean_σ = self.KM.σ
        self.KM.CI = np.array(self.KM.kmf.confidence_interval_)
        percents = np.array(
            [self.percentile(self.KM.Sfun, T, q / 100.) for q in σ_interval])
        self.KM.median = self.KM.kmf.median_survival_time_  # named median_ in older lifelines
        self.KM.median_σ = 0.5 * np.diff(percents[1:])[0]
        self.current = 'KM'
Example #3
    def marginal(self):
        # reverse Kaplan-Meier: flip the status indicator so the fit below
        # estimates the censoring distribution rather than the event distribution
        self.data['status'] = self.data['status'].values.astype(int) ^ 1

        #  weights at requested times
        if "IPCW.times" in self.what:
            kmf = KaplanMeierFitter()
            kmf.fit(self.data['failure_time'],
                    event_observed=self.data['status'].values,
                    timeline=self.times)
            self.weights = np.round(kmf.predict(self.times), decimals=4)
            #self.weights = kmf.conditional_time_to_event_(self.times)
            # self.times = predict(fit, newdata=data, times=times, level_chaos=1, mode="matrix", type="surv")
            self.times = []
        else:
            self.times = None

        # weights at subject specific event times
        if "IPCW.subject.times" in self.what:
            # self.subject_times = prodlim.predictSurvIndividual(fit, lag=self.lag)
            self.subject_times = []
        else:
            self.subject_times = None

        out = {
            'times': self.times,
            'subject_times': self.subject_times,
            'method': self.method
        }
        out = self.output(out, self.keep, self.times, self.fit, self.call)

        # class(out) < - "IPCW"
        return self.weights
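The `^ 1` status flip above is what makes this a reverse Kaplan-Meier: the fitter is applied to the censoring indicator instead of the event indicator, and the resulting survival curve G(t) supplies inverse-probability-of-censoring weights 1/G(t). A self-contained sketch of that idea on toy data (names and values here are illustrative):

import numpy as np
from lifelines import KaplanMeierFitter

durations = np.array([3., 5., 6., 8., 10., 12.])
events = np.array([1, 0, 1, 1, 0, 1])  # 1 = event observed, 0 = censored

# fit KM to the censoring process by flipping the indicator
kmf_censor = KaplanMeierFitter()
kmf_censor.fit(durations, event_observed=1 - events)

eval_times = [4., 7., 11.]
G = kmf_censor.predict(eval_times)       # P(still uncensored at t)
ipcw = 1.0 / np.clip(G, 1e-8, None)      # IPCW weights, guarded against G = 0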
Example #4
    def split(self, X, delta, y, cat=[], mode='train', df=[]):
        min_dur, max_dur = y.min(), y.max()
        times = np.linspace(min_dur, max_dur, 100)
        d = delta.values
        kmf = KaplanMeierFitter()
        kmf.fit(y, 1 - delta)  # reverse KM: fit on the censoring indicator
        s_kmf = kmf.predict(y.squeeze()).values
        t_kmf = kmf.predict(times).values
        setattr(self, f'{mode}_times', torch.from_numpy(times.astype('float32')).float().unsqueeze(-1))
        setattr(self, f'{mode}_s_kmf', torch.from_numpy(s_kmf.astype('float32')).float().unsqueeze(-1))
        setattr(self, f'{mode}_t_kmf', torch.from_numpy(t_kmf.astype('float32')).float().unsqueeze(-1))
        setattr(self, f'{mode}_delta', torch.from_numpy(delta.astype('float32').values).float())
        setattr(self, f'{mode}_y', torch.from_numpy(y).float())
        setattr(self, f'{mode}_X', torch.from_numpy(X).float())
        if self.cat_cols:
            setattr(self, f'{mode}_cat_X', torch.from_numpy(df[cat].astype('int64').values).long())
Example #5
File: km.py Project: xcodevn/SADP
def fun(epsilon):
    li = []
    for kk in range(100):
        newdata_ = laplace_mechanism(his, np.sqrt(2.0) / epsilon)

        newdata = [max([0.0, d]) for d in newdata_]

        ntime = np.asarray([])
        nevent = np.asarray([])
        for i in range(bins0):
            # np.linspace / np.zeros need integer counts, and the Laplace-noised
            # bin counts are floats, so truncate them back to ints
            ntime = np.append(ntime, np.linspace(bin_edges0[i], bin_edges0[i+1], int(newdata[i])))
            #ntime = np.append(ntime, np.ones(newdata[i]) * 0.5 * (bin_edges0[i+1] + bin_edges0[i]))
            nevent = np.append(nevent, np.zeros(int(newdata[i])))

        for i in range(bins1):
            ntime = np.append(ntime, np.linspace(bin_edges1[i], bin_edges1[i+1], int(newdata[bins0 + i])))
            #ntime = np.append(ntime, np.ones(newdata[bins0 + i]) * 0.5 * (bin_edges1[i+1] + bin_edges1[i]))
            nevent = np.append(nevent, np.ones(int(newdata[bins0 + i])))

        kmf1 = KaplanMeierFitter()
        kmf1.fit(ntime, event_observed=nevent)
        #naf1.fit(ntime, event_observed=nevent)
        out = kmf1.predict(kmf.timeline)
        #pyplot.plot (naf1.timeline, naf1.cumulative_hazard_.values)
        #pyplot.plot (naf.timeline, naf.cumulative_hazard_.values)
        #pyplot.show()

        mre = ( np.linalg.norm(out - true_value[:,0]) / np.linalg.norm(true_value[:,0]) )
        li.append(mre)
    avg = np.average( li )
    #mean_relative_error.append(avg)
    print "(%f, %f)" % (epsilon, avg)
Example #6
def kmplot(df_high, df_low):
    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(durations=df_high.duration, event_observed=df_high.event, label='High: n = ' + str(len(df_high)))
        kmf_low.fit(durations=df_low.duration, event_observed=df_low.event, label="Low: n = " + str(len(df_low)))
    except ValueError:
        return ("NA", "0", "0", "0", "0")

    statistics_result = logrank_test(df_high.duration, df_low.duration, event_observed_A=df_high.event, event_observed_B=df_low.event)
    p_value = statistics_result.p_value

    hm5 = kmf_high.predict(60)
    hm10 = kmf_high.predict(120)
    lm5 = kmf_low.predict(60)
    lm10 = kmf_low.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
Example #7
def get_censoring_dist(train_dataset):
    _dataset = train_dataset.dataset
    times, event_observed = [d['time_at_event']
                             for d in _dataset], [d['y'] for d in _dataset]
    all_observed_times = set(times)
    kmf = KaplanMeierFitter()
    kmf.fit(times, event_observed)

    censoring_dist = {time: kmf.predict(time) for time in all_observed_times}
    return censoring_dist
Example #8
    def predict(self, test_x, time_list):
        """
        for each test datapoint, find the k nearest neighbors, and use them to
        fit a Kaplan-Meier Model to get the survival function, and then use
        the survival function the calculate the median survival time

        :param test_df: DataFrame
        :param time_list: checkpoint time to calculate probability on
        :return: the list of median survival time and the probability matrix
        """
        test_df = pd.DataFrame(data=test_x, columns=self._feature_names)
        test_df = self._standardize_df(test_df, flag='test')
        reduced_test_df = self._test_pca(test_df)
        test_x = reduced_test_df.values
        # calculate distance matrix to find the nearest neighbors
        distance_matrix, neighbor_matrix = \
            self.neighbors.kneighbors(
                X=test_x,
                n_neighbors=int(np.min([self.n_neighbors, self.train_points]))
            )

        proba_matrix = []
        test_time_median_pred = []
        for test_idx, test_point in enumerate(test_x):
            # find the k nearest neighbors
            neighbor_train_y = \
                self.train_df.iloc[neighbor_matrix[test_idx]][
                    [self._duration_col, self._event_col]
                ]
            kmf = KaplanMeierFitter()
            kmf.fit(neighbor_train_y[self._duration_col],
                    neighbor_train_y[self._event_col])
            survival_proba = kmf.predict(time_list)
            # calculate the median survival time:
            # the median survival time is the time at which the survival
            # probability equals 0.5. survival_proba is sorted in descending
            # order from 1 to 0, so we only need the first probability <= 0.5
            median_time = np.max(time_list)
            for col, proba in enumerate(survival_proba):
                if proba > 0.5:
                    continue

                if proba == 0.5:
                    median_time = time_list[col]
                else:
                    # here we take the average of the time before and after
                    # Pr = 0.5
                    median_time = (time_list[col - 1] + time_list[col]) / 2
                break

            test_time_median_pred.append(median_time)
            proba_matrix.append(survival_proba)

        return np.array(test_time_median_pred), \
               pd.DataFrame(np.transpose(np.array(proba_matrix)), index=np.array(time_list))
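For reference, recent lifelines versions expose this quantity directly: after `fit`, `median_survival_time_` gives the time at which the curve crosses 0.5, and `lifelines.utils.median_survival_times` does the same for a confidence-interval DataFrame (as used in a later example). A minimal sketch on toy data:

from lifelines import KaplanMeierFitter
from lifelines.utils import median_survival_times

kmf = KaplanMeierFitter()
kmf.fit([2, 4, 4, 6, 8, 10, 12], [1, 1, 0, 1, 1, 0, 1])

print(kmf.median_survival_time_)                        # built-in median survival time
print(median_survival_times(kmf.confidence_interval_))  # CI bounds for the median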
Example #9
def kmplot(df_high, df_low, ax):
    kmf_high = KaplanMeierFitter()
    kmf_low = KaplanMeierFitter()
    try:
        kmf_high.fit(durations=df_high.duration, event_observed=df_high.event, label='High: n = ' + str(len(df_high)))
        kmf_low.fit(durations=df_low.duration, event_observed=df_low.event, label="Low: n = " + str(len(df_low)))
    except ValueError:
        return ("NA", "0", "0", "0", "0")
    kmf_high.plot(ax=ax, color="red", show_censors=True, ci_show=False)
    kmf_low.plot(ax=ax, color="black", show_censors=True, ci_show=False)
    statistics_result = logrank_test(df_high.duration, df_low.duration, event_observed_A=df_high.event, event_observed_B=df_low.event)
    p_value = statistics_result.p_value
    ax.set_xlabel('Time (months)')
    ax.set_ylabel('Probability')
    ax.text(0.95, 0.02, 'logrank P = ' + str('%.4f' % p_value), verticalalignment='bottom', horizontalalignment='right', transform=ax.transAxes, color='black', fontsize=11)
    plt.legend(loc=3)
    hm5 = kmf_high.predict(60)
    hm10 = kmf_high.predict(120)
    lm5 = kmf_low.predict(60)
    lm10 = kmf_low.predict(120)
    return (p_value, hm5, hm10, lm5, lm10)
Example #10
def binomial_log_likelihood_km(times, prob_alive, durations, events, eps=1e-7):
    '''Compute the binomial log-likelihood for survival at given times.

    We compute the binomial log-likelihood weighted by the inverse censoring distribution.
    This is the same weighting scheme as for the Brier score.

    Parameters:
        times: Number or iterable with times at which to compute the scores.
        prob_alive: Numpy array [len(times), len(durations)] with the estimated probabilities
            of each individual to be alive at each time in `times`. Each row represents
            a time in input array `times`.
        durations: Numpy array with times of events.
        events: Boolean numpy array indicating dead/censored (True/False).
        eps: Clip prob_alive at (eps, 1-eps).

    Returns:
        Numpy array with binomial log-likelihoods.
    '''
    from lifelines import KaplanMeierFitter
    if not hasattr(times, '__iter__'):
        times = [times]
    assert prob_alive.__class__ is np.ndarray, 'Need numpy array'
    assert prob_alive.shape == (len(times), len(durations)),\
        'Need prob_alive to have dims [len(times), len(durations)].'
    kmf_censor = KaplanMeierFitter()
    kmf_censor.fit(durations, 1 - events)
    km_censor_at_durations = kmf_censor.survival_function_.loc[
        durations].values.flatten()
    km_censor_at_times = kmf_censor.predict(times)

    prob_alive = np.clip(prob_alive, eps, 1 - eps)

    def compute_score(time_, km_censor_at_time, prob_alive_):
        died = ((durations <= time_) & (events == True))
        survived = (durations > time_)
        event_part = np.log(1 -
                            prob_alive_[died]) / km_censor_at_durations[died]
        survived_part = np.log(prob_alive_[survived]) / km_censor_at_time
        return (np.sum(event_part) + np.sum(survived_part)) / len(durations)

    scores = [
        compute_score(time_, km, pa)
        for time_, km, pa in zip(times, km_censor_at_times, prob_alive)
    ]
    return np.array(scores)
Example #11
def brier_score_km(times, prob_alive, durations, events):
    '''Compute the Brier scores (for survival) at given times.

    For a specification of Brier scores for survival data see e.g.:
    "Assessment of evaluation criteria for survival prediction from
    genomic data" by Bovelstad and Borgan.

    Parameters:
        times: Number or iterable with times at which to compute the Brier scores.
        prob_alive: Numpy array [len(times), len(durations)] with the estimated probabilities
            of each individual to be alive at each time in `times`. Each row represents
            a time in input array `times`.
        durations: Numpy array with times of events.
        events: Boolean numpy array indicating dead/censored (True/False).

    Returns:
        Numpy array with Brier scores.
    '''
    from lifelines import KaplanMeierFitter
    if not hasattr(times, '__iter__'):
        times = [times]
    assert prob_alive.__class__ is np.ndarray, 'Need numpy array'
    assert prob_alive.shape == (len(times), len(durations)),\
        'Need prob_alive to have dims [len(times), len(durations)].'
    kmf_censor = KaplanMeierFitter()
    kmf_censor.fit(durations, 1 - events)
    # km_censor_at_durations = kmf_censor.predict(durations)
    km_censor_at_durations = kmf_censor.survival_function_.loc[
        durations].values.flatten()
    km_censor_at_times = kmf_censor.predict(times)

    def compute_score(time_, km_censor_at_time, prob_alive_):
        died = ((durations <= time_) & (events == True))
        survived = (durations > time_)
        event_part = (prob_alive_**2)[died] / km_censor_at_durations[died]
        survived_part = ((1 - prob_alive_)**2)[survived] / km_censor_at_time
        return (np.sum(event_part) + np.sum(survived_part)) / len(durations)

    b_scores = [
        compute_score(time_, km, pa)
        for time_, km, pa in zip(times, km_censor_at_times, prob_alive)
    ]
    return np.array(b_scores)
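A small worked call for the function above, with toy arrays shaped the way the docstring describes (`prob_alive` has one row per evaluation time and one column per subject; all values are illustrative):

import numpy as np

durations = np.array([2., 4., 6., 8., 10.])
events = np.array([True, False, True, True, False])
times = [3., 5., 9.]

prob_alive = np.array([
    [0.9, 0.8, 0.95, 0.85, 0.9],   # estimates at t = 3
    [0.7, 0.6, 0.85, 0.70, 0.8],   # estimates at t = 5
    [0.3, 0.2, 0.50, 0.40, 0.6],   # estimates at t = 9
])

print(brier_score_km(times, prob_alive, durations, events))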
Example #12
    ridge_brier_scores = []
    kaplan_brier_scores = []
    kaplan_group_sizes = []

    kmf = KaplanMeierFitter()
    kmf.fit(
        event_time_train,
        event_observed=event_indicator_train,
        timeline=eval_times_brier_score,
    )

    for s in np.unique(strata):

        strata_train_dat = survival_data_train[strata_train == s]
        strata_test_dat = survival_data_test[strata_test == s]

        kaplan_preds = np.repeat(
            [kmf.predict(eval_times_brier_score).to_numpy()],
            strata_test_dat.shape[0],
            axis=0,
        )

        times, km_score = brier_score(
            survival_train=strata_train_dat,
            survival_test=strata_test_dat,
            estimate=kaplan_preds,
            times=eval_times_brier_score,
        )

        kaplan_brier_scores.append(km_score)
        kaplan_group_sizes.append(strata_test_dat.shape[0])

    # the source breaks off here; a plausible completion, averaging the
    # per-stratum Brier curves weighted by group size:
    kmf_brier_scores = np.average(np.stack(kaplan_brier_scores),
                                  weights=kaplan_group_sizes,
                                  axis=0)
Example #13
Female.head()

#Fit data into objects:
kmf_m.fit(durations=Male["time"], event_observed=Male["dead"], label="Male")
kmf_f.fit(durations=Female["time"],
          event_observed=Female["dead"],
          label="Female")

#Event table for male group:
kmf_m.event_table

#Event table for female group:
kmf_f.event_table

#Predict value based on time:
kmf_m.predict(11)

#Predict value based on time:
kmf_f.predict(11)

#Get complete data of survival function for male group:
kmf_m.survival_function_

#Get complete data of survival function for female group:
kmf_f.survival_function_

#Plot the survival_function data:
kmf_m.plot()
kmf_f.plot()
plt.xlabel("Days Passed")
plt.ylabel("Survival Probability")
Example #14
Female.head()

#Fit data into objects:
km_m.fit(durations=Male["time"], event_observed=Male["dead"], label="Male")
km_f.fit(durations=Female["time"],
         event_observed=Female["dead"],
         label="Female")

#Event table for male group:
km_m.event_table

#Event table for female group:
km_f.event_table

#Predict value based on time:
km_m.predict(11)

#Predict value based on time:
km_f.predict(11)

#Get complete data of survival function for male group:
km_m.survival_function_

#Get complete data of survival function for female group:
km_f.survival_function_

#Plot the survival_function data:
km_m.plot()
km_f.plot()
plt.xlabel("Days Passed")
plt.ylabel("Survival Probability")
Example #15
def predictNextDeal(targetGameDetails, publisherGameDetails, genreGameDetails,
                    filledHistories, validate=False):
#%%
#------------------------------------------------------------------------------
# parse inputs for needed data formats
#------------------------------------------------------------------------------
    # get target game information
    ind = targetGameDetails.index.tolist()
    targetReleaseDate = targetGameDetails['Release Data'][ind[0]]
    
    # get price history information
    targetHistory = filledHistories[0]
    publisherHistory = filledHistories[1]
    genreHistory = filledHistories[2]
    
#%%
#------------------------------------------------------------------------------
# feature engineering: target price history
# 1) transform dates to days since release
# 2) find time when deals occur and duration from last deal's end to current deal's start
#------------------------------------------------------------------------------
    target_dur_until_deal = []
    targetDaysSinceRelease = []
    if len(targetHistory) > 0:
        # Part 1 - transform dates to days since release
        # convert timestamps to dates 
        targetGameDates = []
        for ts in targetHistory['timestamps']:
            targetGameDates.append(datetime.datetime.fromtimestamp(ts).date())
        
        # convert dates --> days since release
        # transform specific dates to time deltas (in days) from release date,
        # i.e. days from release_date to timepoint1, release_date to timepoint2, etc.
        for fmt in ('%b %d, %Y', '%b %Y'):
            try:
                targetfirstdayDT = datetime.datetime.strptime(targetReleaseDate, fmt).date()
                break
            except ValueError:
                continue
        else:
            targetfirstdayDT = parse(targetReleaseDate, ignoretz=True).date()
        
        for i in range(len(targetGameDates)):
            delta = targetGameDates[i] - targetfirstdayDT
            targetDaysSinceRelease.append(delta.days)
        
#        plt.figure()
#        plt.scatter(targetDaysSinceRelease, targetHistory.iloc[:,1])
#        plt.title("Price History " + targetName, fontsize=50)
#        plt.xlabel("Days Since Release", fontsize=30)
#        plt.ylabel("Price", fontsize=30)
#        ax = plt.gca()
#        ax.tick_params(axis = 'both', which = 'major', labelsize = 20)
#        ax.tick_params(axis = 'both', which = 'minor', labelsize = 12)
        
        # Part 2 - find time when deals occur and the duration from last deal's end
        # find change in target price history time series
        targetPriceDiff = targetHistory.iloc[:,1].diff()
        
        # combine days since release and price history change into a matrix
        targetPriceChanges = np.zeros([len(targetPriceDiff), 2])
        for i in range(len(targetPriceChanges)):
            tempDate = targetDaysSinceRelease[i]
            tempPrice = targetPriceDiff.iloc[i]
            targetPriceChanges[i,:] = [tempDate, tempPrice]
        
        # remove first row with nan
        priceChanges = targetPriceChanges[1:,:]
        
        # remove rows with zeros
        priceChanges = priceChanges[~(priceChanges==0).any(1),:]
        
        # markers for when deals start
        targetDealStart = priceChanges[(priceChanges<0).any(1),:]
        
        # markers for when deals end
        targetDealEnd = priceChanges[~(priceChanges<0).any(1),:]
        firstRow = targetPriceChanges[0,:]
        targetDealEnd = np.vstack([firstRow, targetDealEnd])
        
        # find duration between deals
        for i in range(len(targetDealStart[:,0])):
            # find index of day for current deal
            dealDay = targetDealStart[i, 0]
            daysColumn = targetPriceChanges[:,0]
            dealDayInd = np.where(daysColumn == dealDay)[0][0]
            
            # find day for when last normal price began
            temp = targetPriceChanges[:int(dealDayInd),:]
            previousNonZero = temp[(temp[:,1]!=0),:]
            originDay = previousNonZero[-1][0]
            
            # find time elapsed since last deal ended
            duration = dealDay - originDay
            
            target_dur_until_deal = np.hstack([target_dur_until_deal, duration])
        
        # unlike the publisher/genre sections below there is no zero seed here,
        # so nothing needs to be stripped from the front of the array
    
#%%
#------------------------------------------------------------------------------
# feature engineering: publisher price history
# 1) transform dates to days since release
# 2) find time when deals occur and duration from last deal's end to current deal's start
#------------------------------------------------------------------------------
    # Part 1 - transform dates to days since release
    # convert dates --> days since release
    # transform specific dates to time deltas (in days) from release date
    # i.e. days from release_date to timepoint1, release_date to timepoint2, etc.
    publisherDaysSinceRelease = []
    # make sure there are still other publisher games
    if len(publisherHistory) > 0:
        publisherGameTitles = []
        
        for ind in range(len(publisherHistory)):
            dealDates = []
            for ts in publisherHistory[ind]['timestamps']:
                dealDates.append(datetime.datetime.fromtimestamp(ts).date())
            
            # get first day
            columnnames = list(publisherHistory[ind].columns.values)
            ind2 = publisherGameDetails.loc[publisherGameDetails['Game Title'] == columnnames[1]].index.tolist()
            firstday = publisherGameDetails['Release Data'][ind2[0]]
            # try the release-date formats seen in the data, falling back to
            # dateutil's parser
            for fmt in ('%b %d, %Y', '%d %b, %Y', '%B %dth, %Y', '%b %Y'):
                try:
                    firstdayDT = datetime.datetime.strptime(firstday, fmt).date()
                    break
                except ValueError:
                    continue
            else:
                firstdayDT = parse(firstday, ignoretz=True).date()
            
            # calculate time delta from release date
            gameDelta = []
            for i in range(len(dealDates)):
                timeDelta = dealDates[i] - firstdayDT
                gameDelta.append(timeDelta.days)
            publisherGameTitles.append(columnnames[1])
            publisherDaysSinceRelease.append(gameDelta)

    # visualize publisher price history x days since release
#    for ind in range(len(publisherHistory)):
#        plt.scatter(publisherDaysSinceRelease[ind], publisherHistory[ind][publisherGameTitles[ind]])
##        plt.title("Price History for Ubisoft Games", fontsize=50)
#        plt.title("Price History for " + publisherGameTitles[ind], fontsize=50)
#        plt.xlabel("Days Since Release", fontsize=30)
#        plt.ylabel("Price", fontsize=30)
#        ax = plt.gca()
#        ax.tick_params(axis = 'both', which = 'major', labelsize = 20)
#        ax.tick_params(axis = 'both', which = 'minor', labelsize = 12)
#        ax.set_ylim([0, 70])
    
    # Part 2 - find time when deals occur and the duration from last deal's end
    publisher_dur_until_deal = []
    if len(publisherHistory) > 0:
        
        for ind in range(len(publisherHistory)):
            # find change in publisher price history time series
            publisherPriceDiff = publisherHistory[ind].iloc[:,1].diff()
            
            # combine days since release and price history change into a matrix
            publisherPriceChanges = np.zeros([len(publisherPriceDiff), 2])
            for i in range(len(publisherPriceChanges)):
                tempDate = publisherDaysSinceRelease[ind][i]
                tempPrice = publisherPriceDiff.iloc[i]
                publisherPriceChanges[i,:] = [tempDate, tempPrice]
    
            # remove first row with nan
            priceChanges = publisherPriceChanges[1:,:]
    
            # remove rows with zeros
            priceChanges = priceChanges[~(priceChanges==0).any(1),:]
    
            # markers for when deals start
            publisherDealStart = priceChanges[(priceChanges<0).any(1),:]
    
            # markers for when deals end
            publisherDealEnd = priceChanges[~(priceChanges<0).any(1),:]
            firstRow = publisherPriceChanges[0,:]
            publisherDealEnd = np.vstack([firstRow, publisherDealEnd])
    
            # find duration between deals
            temp_dur_until_deal = np.zeros(1)
            for i in range(len(publisherDealStart[:,0])):
                # find index of day for current deal
                dealDay = publisherDealStart[i, 0]
                daysColumn = publisherPriceChanges[:,0]
                dealDayInd = np.where(daysColumn == dealDay)[0][0]
                
                # find day for when last normal price began
                temp = publisherPriceChanges[:int(dealDayInd),:]
                previousNonZero = temp[(temp[:,1]!=0),:]
                originDay = previousNonZero[-1][0]
                
                # find time elapsed since last deal ended
                duration = dealDay - originDay
                
                temp_dur_until_deal = np.hstack([temp_dur_until_deal, duration])
    
            # remove first row with zero initialization
            temp_dur_until_deal = temp_dur_until_deal[1:]
            
            # stack publisher durations in matrix
            publisher_dur_until_deal.append(temp_dur_until_deal)
    
#%%
#------------------------------------------------------------------------------
# feature engineering: genre price history
# 1) transform dates to days since release
# 2) find time when deals occur and duration from last deal's end to current deal's start
#------------------------------------------------------------------------------
    # Part 1 - transform dates to days since release
    # convert dates --> days since release
    # transform specific dates to time deltas (in days) from release date
    # i.e. days from release_date to timepoint1, release_date to timepoint2, etc.
    genreDaysSinceRelease = []
    # fall back to genre games only when there are no publisher games
    if len(genreHistory) > 0 and len(publisherHistory) < 1:
        genreGameTitles = []
        
        for ind in range(len(genreHistory)):
            dealDates = []
            for ts in genreHistory[ind]['timestamps']:
                dealDates.append(datetime.datetime.fromtimestamp(ts).date())
            
            # get first day
            columnnames = list(genreHistory[ind].columns.values)
            ind2 = genreGameDetails.loc[genreGameDetails['Game Title'] == columnnames[1]].index.tolist()
            firstday = genreGameDetails['Release Data'][ind2[0]]
            # try the release-date formats seen in the data, falling back to
            # dateutil's parser
            for fmt in ('%b %d, %Y', '%d %b, %Y', '%B %dth, %Y', '%b %Y'):
                try:
                    firstdayDT = datetime.datetime.strptime(firstday, fmt).date()
                    break
                except ValueError:
                    continue
            else:
                firstdayDT = parse(firstday, ignoretz=True).date()
            
            # calculate time delta from release date
            gameDelta = []
            for i in range(len(dealDates)):
                timeDelta = dealDates[i] - firstdayDT
                gameDelta.append(timeDelta.days)
            genreGameTitles.append(columnnames[1])
            genreDaysSinceRelease.append(gameDelta)

    ## visualize publisher price history x days since release
#    for ind in range(len(publisherHistory)):
#        plt.scatter(publisherDaysSinceRelease[ind], publisherHistory[ind][publisherGameTitles[ind]])
#        plt.title("Price History for Ubisoft Games", fontsize=50)
#        plt.xlabel("Days Since Release", fontsize=30)
#        plt.ylabel("Price", fontsize=30)
#        ax = plt.gca()
#        ax.tick_params(axis = 'both', which = 'major', labelsize = 20)
#        ax.tick_params(axis = 'both', which = 'minor', labelsize = 12)
    
    # Part 2 - find time when deals occur and the duration from last deal's end
    genre_dur_until_deal = []
    if len(genreHistory) > 0 and len(publisherHistory) < 1:
        for ind in range(len(genreHistory)):
            # find change in genre price history time series
            genrePriceDiff = genreHistory[ind].iloc[:,1].diff()
            
            # combine days since release and price history change into a matrix
            genrePriceChanges = np.zeros([len(genrePriceDiff), 2])
            for i in range(len(genrePriceChanges)):
                tempDate = genreDaysSinceRelease[ind][i]
                tempPrice = genrePriceDiff.iloc[i]
                genrePriceChanges[i,:] = [tempDate, tempPrice]
    
            # remove first row with nan
            priceChanges = genrePriceChanges[1:,:]
    
            # remove rows with zeros
            priceChanges = priceChanges[~(priceChanges==0).any(1),:]
    
            # markers for when deals start
            genreDealStart = priceChanges[(priceChanges<0).any(1),:]
    
            # markers for when deals end
            genreDealEnd = priceChanges[~(priceChanges<0).any(1),:]
            firstRow = genrePriceChanges[0,:]
            genreDealEnd = np.vstack([firstRow, genreDealEnd])
    
            # find duration between deals
            temp_dur_until_deal = np.zeros(1)
            for i in range(len(genreDealStart[:,0])):
                # find index of day for current deal
                dealDay = genreDealStart[i, 0]
                daysColumn = genrePriceChanges[:,0]
                dealDayInd = np.where(daysColumn == dealDay)[0][0]
                
                # find day for when last normal price began
                temp = genrePriceChanges[:int(dealDayInd),:]
                previousNonZero = temp[(temp[:,1]!=0),:]
                originDay = previousNonZero[-1][0]
                
                # find time elapsed since last deal ended
                duration = dealDay - originDay
                
                temp_dur_until_deal = np.hstack([temp_dur_until_deal, duration])
    
            # remove first row with zero initialization
            temp_dur_until_deal = temp_dur_until_deal[1:]
            
            # stack genre durations in matrix
            genre_dur_until_deal.append(temp_dur_until_deal)
    
#%%
#------------------------------------------------------------------------------
# machine learning: probability of deal
#------------------------------------------------------------------------------
    if validate is True and len(targetHistory) > 0:
        if len(target_dur_until_deal) > 1:
            # choose random target deal that has already occurred
#            testDuration = random.choice(target_dur_until_deal)
            # choose last target deal that has already occurred
            testDuration = target_dur_until_deal[-1]
            
            # get row index of target price in target unique price matrix
            ind = np.where(target_dur_until_deal == testDuration)[0][0]
            
            # remove target price and prices after it
            target_dur_until_deal = np.delete(target_dur_until_deal, np.s_[ind])
        else:
            testDuration = np.NaN
    else:
        testDuration = np.NaN
    
    
    # combine all data points into single array
    all_dur_until_deal = np.zeros(1) # initialize array
    if len(target_dur_until_deal) > 0:
        all_dur_until_deal = target_dur_until_deal
    
    if len(publisher_dur_until_deal) > 0:
        for ind in range(len(publisher_dur_until_deal)):
            all_dur_until_deal = np.hstack((all_dur_until_deal, publisher_dur_until_deal[ind]))
    
    if len(genre_dur_until_deal) > 0:
        for ind in range(len(genre_dur_until_deal)):
            all_dur_until_deal = np.hstack((all_dur_until_deal, genre_dur_until_deal[ind]))
    
    # create event array
    all_event = np.ones(all_dur_until_deal.shape)
    
    # get days since last target deal
    if len(targetDaysSinceRelease) > 0:
        censoredNextDealDur = targetDaysSinceRelease[-1] - targetDealEnd[-1][0]
    else:
        censoredNextDealDur = 0
    
    # add days since last target deal to duration and event matrices
    all_dur_until_deal = np.append(all_dur_until_deal, censoredNextDealDur)
    all_event = np.append(all_event, 0)
    
    # transform to Pandas dataframe for fitting
    tempmat = np.vstack((all_dur_until_deal, all_event)).T
    columns = ['Duration', 'Event']
    duration = pd.DataFrame(tempmat, columns=columns)
    
    # estimate survival function using Kaplan-Meier estimator
    kmf = KaplanMeierFitter()
    kmf.fit(duration['Duration'], event_observed=duration['Event'])
    
    # visualize probability of deals
#    kmf.survival_function_.plot(linewidth=3.3)
#    plt.title('Survival Function of No Deals (Activision Games)', fontsize=50)
#    plt.xlabel("Days Since Last Deal", fontsize=30)
#    plt.ylabel("Probability No Deal Tomorrow", fontsize=30)
#    ax = plt.gca()
#    ax.xaxis.set_label_coords(0.5,-.07)
#    ax.yaxis.set_label_coords(-.05,0.5)
#    ax.tick_params(axis = 'both', which = 'major', labelsize = 20)
#    ax.tick_params(axis = 'both', which = 'minor', labelsize = 12)
#    ax.set_xlim([0, 160])
    
    # probability a deal will occur tomorrow
    probDealTomorrow = 1 - kmf.predict(censoredNextDealDur)
    
    # find when a deal becomes highly probable, i.e. when the survival
    # probability of "no deal" drops below 0.10
    try:
        kmf_survival = kmf.survival_function_
        threshold = kmf_survival[kmf_survival['KM_estimate'] < .10]
        nextHighProbDay = threshold.index[0]
        nextHighProbDay = nextHighProbDay - censoredNextDealDur
    except IndexError:
        nextHighProbDay = np.NaN
    
    # Cox Proportional Hazard Model (regression model for future)
#    cph = CoxPHFitter()
#    cph.fit(duration, duration_col=0, show_progress=True)
#    cph.print_summary()  # access the results using cph.summary
#    
#    cph.predict_partial_hazard(duration)
#    cph.predict_survival_function(duration, times=[5., 25., 50.])
#    cph.predict_median(duration)
    
#%%
    return list([probDealTomorrow, nextHighProbDay, testDuration])
Example #16
event_at_0 = kmf.event_table.iloc[0, :]
# now calculate the survival probability for t = 0
surv_for_0 = (event_at_0.at_risk - event_at_0.observed) / event_at_0.at_risk

# Calculate the survival probability for t = 1
event_at_1 = kmf.event_table.iloc[1, :]
surv_for_1 = (event_at_1.at_risk - event_at_1.observed) / event_at_1.at_risk

# Calculate the survival probability for t = 2
event_at_2 = kmf.event_table.iloc[2, :]
surv_for_2 = (event_at_2.at_risk - event_at_2.observed) / event_at_2.at_risk

# The probability that an NFL player has a career longer than 2 years
surv_after_2 = surv_for_0 * surv_for_1 * surv_for_2

kmf.predict(2)

# The survival probabilities of NFL players after 1, 3, 5, and 10 yrs played
kmf.predict([1,3,5,10])

kmf.survival_function_

kmf.median_survival_time_  # called kmf.median_ in older lifelines releases

# plot the KM estimate
kmf.plot()
# Add title and y-axis label
plt.title("The Kaplan-Meier Estimate for Drafted NFL Players\n(1967-2015)")
plt.ylabel("Probability a Player is Still Active")

plt.show()
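The chained products above are exactly what `predict` computes: the KM estimate at t is the product of the per-time conditional survival fractions up to t. A self-contained check on toy data (not the NFL dataset):

import numpy as np
from lifelines import KaplanMeierFitter

kmf = KaplanMeierFitter()
kmf.fit([1, 2, 2, 3, 4, 5], [1, 1, 0, 1, 1, 1])

table = kmf.event_table
# conditional survival fraction at each tabulated time, multiplied up to t = 3
fractions = (table.at_risk - table.observed) / table.at_risk
manual = fractions.loc[:3].prod()

assert np.isclose(manual, kmf.predict(3))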
Example #17
print(kmf_train.median_survival_time_)
print(kmf_test.median_survival_time_)

print(median_survival_times(kmf_train.confidence_interval_))
print(median_survival_times(kmf_test.confidence_interval_))

print(kmf_train.event_table)
print(kmf_test.event_table)

print('Survival probability for t=60 for train set: ', kmf_train.predict(60))
print('Survival probability for t=60 for test set: ', kmf_test.predict(60))

results = logrank_test(train['PFS'],
                       test['PFS'],
                       train['disease_progress'],
                       test['disease_progress'],
                       alpha=.95)

results.print_summary()

Example #18
#Calculating the actual survival probability at a given time:

surv_after_0 = surv_for_0 
print("Survival Probability After 0 Days: ",surv_after_0)

#Calculating the actual survival probability at a given time:
surv_after_5 = surv_for_0 * surv_for_5
print("Survival Probability After 5 Days: ",surv_after_5)


#Calculating the actual survival probability at a given time:
surv_after_11 = surv_for_0 * surv_for_5 * surv_for_11
print("Survival Probability After 11 Days: ",surv_after_11)

#Get the probability values the easy way!
print("Survival probability for t=0: ",kmf.predict(0))
print("Survival probability for t=5: ",kmf.predict(5))
print("Survival probability for t=11: ",kmf.predict(11))

#Predicting the survival probability for an array of values:
kmf.predict([0,5,11,12])

#To get the full list:
kmf.survival_function_

#Plot the graph:
kmf.plot()
plt.title("The Kaplan-Meier Estimate")
plt.xlabel("Number of days")
plt.ylabel("Probability of survival")
Example #19
def _calibration_curve_ipcw(out,
                            e,
                            t,
                            a,
                            group,
                            eval_time,
                            typ,
                            ret_bins=True,
                            strat='quantile',
                            n_bins=10):
    """Returns the Calibration curve and the bins given some risk scores.

  Accepts the output of a trained survival model at a certain evaluation time,
  the event indicators and protected group membership and outputs an IPCW
  adjusted calibration curve.

  Args:
    out:
      risk scores P(T>t) issued by a trained survival analysis model
      (output of fair_survival_analysis.models.predict_survival).
    e:
      a numpy vector of indicators specifying is event or censoring occured.
    t:
      a numpy vector of times at which the events or censoring occured.
    a:
      a numpy vector of protected attributes.
    group:
      string indicating the demogrpahic to evaluate calibration for.
    eval_time:
      float/int of the event time at which calibration is to be evaluated. Must
      be same as the time at which the Risk Scores were issues.
    typ:
      Determines if the calibration curves are to be computed on the individuals
      that experienced the event or adjusted estimates for individuals that are
      censored using IPCW estimator on a population or subgroup level
    ret_bins:
      Boolean that specifies if the bins of the calibration curve are to be
      returned.
    strat:
      Specifies how the bins are computed. One of:
      "quantile": Equal sized bins.
      "uniform": Uniformly stratified.
    n_bins:
      int specifying the number of bins to use to compute the ece.
  Returns:
    Calibration Curve: A tuple of True Probality, Estimated Probability in
    each bin and the estimated Expected Calibration Error.

  """

    if typ == 'IPCWpop':
        kmf = KaplanMeierFitter().fit(t, 1 - e)

    else:
        t_ = t[a == group]
        e_ = e[a == group]

        kmf = KaplanMeierFitter().fit(t_, 1 - e_)

    out_ = out.copy()

    e = e[a == group]
    t = t[a == group]
    out = out[a == group]

    y = t > eval_time

    if strat == 'quantile':

        quantiles = [(1. / n_bins) * i for i in range(n_bins + 1)]
        outbins = np.quantile(out, quantiles)

    if strat == 'uniform':

        binlen = (out.max() - out.min()) / n_bins
        outbins = [out.min() + i * binlen for i in range(n_bins + 1)]

    prob_true = []
    prob_pred = []

    ece = 0

    for n_bin in range(n_bins):

        binmin = outbins[n_bin]
        binmax = outbins[n_bin + 1]

        scorebin = (out >= binmin) & (out <= binmax)

        # scorebin is a boolean mask over all of `out`, so count the bin's
        # members with sum() rather than len()
        weight = float(scorebin.sum()) / len(out)

        out_ = out[scorebin]
        y_ = y[scorebin]

        y_ = y_ / kmf.predict(eval_time)

        pred = y_.mean()

        prob_true.append(pred)

        prob_pred.append(out_.mean())

        gap = abs(prob_pred[-1] - prob_true[-1])

        ece += weight * gap

    if ret_bins:
        return prob_true, prob_pred, outbins, ece

    else:
        return prob_true, prob_pred, ece
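A toy invocation of the function above, with randomly generated scores, times, and group labels purely for illustration:

import numpy as np

rng = np.random.default_rng(0)
n = 500
t = rng.exponential(10., size=n)        # event/censoring times
e = rng.binomial(1, 0.7, size=n)        # 1 = event observed, 0 = censored
a = rng.choice(['A', 'B'], size=n)      # protected attribute
out = rng.uniform(0., 1., size=n)       # model scores P(T > eval_time)

prob_true, prob_pred, bins, ece = _calibration_curve_ipcw(
    out, e, t, a, group='A', eval_time=10.0, typ='IPCWpop')
print(ece)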
Example #20
            else:
                cls = 'Nonresponder'
            month_dic[cls].append(surDic[pat]['months'])
            status_dic[cls].append(surDic[pat]['status'])

        # logrank Test
        results = logrank_test(month_dic['Responder'],
                               month_dic['Nonresponder'],
                               event_observed_A=status_dic['Responder'],
                               event_observed_B=status_dic['Nonresponder'])
        pvalue = results.p_value

        for cls in month_dic:
            kmf = KaplanMeierFitter()
            kmf.fit(month_dic[cls], status_dic[cls])
            fiveYear_dic[cls] = kmf.predict(60)

        # draw survival plot
        f = plt.figure(figsize=(4, 4))
        ax = f.add_subplot(1, 1, 1)
        plt.title('%s / %s / %s / %s\npvalue=%.4f\n' %
                  (cancer_type, drug, ML, testing_pathway_rank, pvalue),
                  fontsize=8)

        c1 = KaplanMeierFitter()
        ax = c1.fit(month_dic['Responder'],
                    status_dic['Responder'],
                    label='Responder (n=%s)' %
                    len(month_dic['Responder'])).plot(ax=ax,
                                                      ci_show=True,
                                                      c='r')
Example #21
######## Survival probability at t=0 only
event_at_0 = kmf.event_table.iloc[0, :]
survival_for_0 = (event_at_0.at_risk -
                  event_at_0.observed) / event_at_0.at_risk
print("Surival probability at time 0 only is : ", survival_for_0)

######## Survival probability at t=5 only
event_at_5 = kmf.event_table.iloc[1, :]
survival_for_5 = (event_at_5.at_risk -
                  event_at_5.observed) / event_at_5.at_risk
print("Surival probability at time 5 only is : ", survival_for_5)

######## Survival probability at t=13 only
event_at_13 = kmf.event_table.iloc[4, :]
survival_for_13 = (event_at_13.at_risk -
                   event_at_13.observed) / event_at_13.at_risk
print("Surival probability at time 13 only is : ", survival_for_13)

##### Survival probability probability after 5 days (for t= 5)
survival_after_5 = survival_for_0 * survival_for_5
print("\nSurvival Probability after 5 days : ", survival_after_5)

#### Automate the work we've done above
print("\nSurvival Probability after 5 days : ", kmf.predict(5))
print("Survival Probability after 3 days : ", kmf.predict(13))
print("Survival Probability after 1022 days : ", kmf.predict(1022))

#### Survival probability for whole timeline
print("\n", kmf.survival_function_)
Example #22
def survival_difference_at_fixed_point_in_time_test(
        point_in_time,
        durations_A,
        durations_B,
        event_observed_A=None,
        event_observed_B=None,
        **kwargs) -> StatisticalResult:
    """

    Often analysts want to compare the survival-ness of groups at specific times, rather than comparing the entire survival curves against each other.
    For example, analysts may be interested in 5-year survival. Statistically comparing the naive Kaplan-Meier points at a specific time
    actually has reduced power (see [1]). By transforming the Kaplan-Meier curve, we can recover more power. This function uses
    the log(-log) transformation.


    Parameters
    ----------
    point_in_time: float,
        the point in time to analyze the survival curves at.

    durations_A: iterable
        a (n,) list-like of event durations (birth to death,...) for the first population.

    durations_B: iterable
        a (n,) list-like of event durations (birth to death,...) for the second population.

    event_observed_A: iterable, optional
        a (n,) list-like of censorship flags, (1 if observed, 0 if not), for the first population.
        Default assumes all observed.

    event_observed_B: iterable, optional
        a (n,) list-like of censorship flags, (1 if observed, 0 if not), for the second population.
        Default assumes all observed.

    kwargs:
        add keywords and meta-data to the experiment summary


    Returns
    -------

    StatisticalResult
      a StatisticalResult object with properties ``p_value``, ``summary``, ``test_statistic``, ``print_summary``

    Examples
    --------
    .. code:: python

        T1 = [1, 4, 10, 12, 12, 3, 5.4]
        E1 = [1, 0, 1,  0,  1,  1, 1]

        T2 = [4, 5, 7, 11, 14, 20, 8, 8]
        E2 = [1, 1, 1, 1,  1,  1,  1, 1]

        from lifelines.statistics import survival_difference_at_fixed_point_in_time_test
        results = survival_difference_at_fixed_point_in_time_test(12, T1, T2, event_observed_A=E1, event_observed_B=E2)

        results.print_summary()
        print(results.p_value)        # 0.893
        print(results.test_statistic) # 0.017

    Notes
    -----
    Other transformations are possible, but Klein et al. [1] showed that the log(-log(c)) transform has the most desirable
    statistical properties.

    References
    -----------

    [1] Klein, J. P., Logan, B. , Harhoff, M. and Andersen, P. K. (2007), Analyzing survival curves at a fixed point in time. Statist. Med., 26: 4505-4519. doi:10.1002/sim.2864

    """

    kmfA = KaplanMeierFitter().fit(durations_A,
                                   event_observed=event_observed_A)
    kmfB = KaplanMeierFitter().fit(durations_B,
                                   event_observed=event_observed_B)

    sA_t = kmfA.predict(point_in_time)
    sB_t = kmfB.predict(point_in_time)

    # this is doing a prediction/interpolation between the kmf's index.
    sigma_sqA = interpolate_at_times_and_return_pandas(kmfA._cumulative_sq_,
                                                       point_in_time)
    sigma_sqB = interpolate_at_times_and_return_pandas(kmfB._cumulative_sq_,
                                                       point_in_time)

    log = np.log
    clog = lambda s: log(-log(s))

    X = (clog(sA_t) - clog(sB_t))**2 / (sigma_sqA / log(sA_t)**2 +
                                        sigma_sqB / log(sB_t)**2)
    p_value = _chisq_test_p_value(X, 1)

    return StatisticalResult(
        p_value,
        X,
        null_distribution="chi squared",
        degrees_of_freedom=1,
        point_in_time=point_in_time,
        test_name="survival_difference_at_fixed_point_in_time_test",
        **kwargs)
Example #23
      )  ## group i2, having the pandas Series for the 2nd cohort

## fit the model for 1st cohort
kmf.fit(T[i1], E[i1], label='No Partner')
a1 = kmf.plot()

## fit the model for 2nd cohort
kmf.fit(T[i2], E[i2], label='Partner')
kmf.plot(ax=a1)

#### 3 new cohorts are compared
# 1. Contract type is Month-to-month
# 2. Contract type is Two year
# 3. Contract type is One year
groups = input_df['Contract']  ## Create the cohorts from the 'Contract' column
ix1 = (groups == 'Month-to-month')  ## Cohort 1
ix2 = (groups == 'Two year')  ## Cohort 2
ix3 = (groups == 'One year')  ## Cohort 3

kmf.fit(T[ix1], E[ix1], label='Month-to-month')  ## fit the cohort 1 data
ax = kmf.plot()

kmf.fit(T[ix2], E[ix2], label='Two year')  ## fit the cohort 2 data
ax1 = kmf.plot(ax=ax)

kmf.fit(T[ix3], E[ix3], label='One year')  ## fit the cohort 3 data
kmf.plot(ax=ax1)  ## Plot the KM curve for three cohort on same x and y axis

print(kmf.predict(T[0]))
plt.show()