def _fit_model(self, T, E, entry, show_progress=True):
    """
    Minimize ``self._negative_log_likelihood`` with L-BFGS-B.

    Parameters
    ----------
    T, E:
        forwarded unchanged to the negative log-likelihood.
    entry:
        array-like; only its strictly positive elements are forwarded.
    show_progress: bool
        passed to the optimizer as its ``disp`` option.

    Returns
    -------
    tuple
        ``(results.x, -results.fun * n, n * hessian)`` where ``n = T.shape[0]``
        and ``hessian`` is the Hessian of the negative log-likelihood
        evaluated at ``results.x``.

    Raises
    ------
    ConvergenceError
        if the optimizer does not report success. The wording of the message
        depends on ``self._KNOWN_MODEL``.
    """
    positive_entries = entry[entry > 0]

    # Warnings emitted while optimizing are silenced for the whole block.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # value_and_grad returns the objective and its gradient together,
        # which is why jac=True is passed.
        results = minimize(
            value_and_grad(self._negative_log_likelihood),  # pylint: disable=no-value-for-parameter
            self._initial_values,
            jac=True,
            method="L-BFGS-B",
            args=(T, E, positive_entries),
            bounds=self._bounds,
            options={"disp": show_progress},
        )

        if not results.success:
            # Dump the optimizer report before failing so the user can inspect it.
            print(results)
            if self._KNOWN_MODEL:
                message = """\
                    Fitting did not converge. This is mostly a lifelines problem, but a few things you can check:

                    1. Are there any extreme values in the durations column?
                      - Try scaling your durations to a more reasonable values closer to 1 (multipling or dividing by some 10^n).
                      - Try dropping them to see if the model converges.
                    """
            else:
                message = """\
                    Fitting did not converge.

                    1. Are two parameters in the model colinear / exchangeable? (Change model)
                    2. Is the cumulative hazard always non-negative and always non-decreasing? (Assumption error)
                    3. Are there inputs to the cumulative hazard that could produce nans or infs? (Check your _bounds)

                    This could be a problem with your data:
                    1. Are there any extreme values in the durations column?
                      - Try scaling your durations to a more reasonable value closer to 1 (multipling or dividing by a large constant).
                      - Try dropping them to see if the model converges.
                    """
            raise ConvergenceError(dedent(message))

        # pylint: disable=no-value-for-parameter
        hessian_ = hessian(self._negative_log_likelihood)(results.x, T, E, positive_entries)
        return results.x, -results.fun * T.shape[0], T.shape[0] * hessian_
def _fit_model(self, T, E, *Xs, **kwargs):
    """
    Minimize ``self._negative_log_likelihood`` starting from an all-zero vector.

    Parameters
    ----------
    T, E:
        forwarded unchanged to the negative log-likelihood.
    *Xs:
        one or more 2-d design matrices. The number of fitted parameters is the
        sum of their column counts, and every matrix is forwarded (in order)
        to the negative log-likelihood.
    show_progress: bool, keyword only (default False)
        passed to the optimizer as ``disp``; also prints the optimizer report.

    Returns
    -------
    tuple
        ``(results.x, -self._n_examples * results.fun, self._n_examples * hessian)``
        where ``hessian`` is the Hessian of the negative log-likelihood at
        ``results.x``.

    Raises
    ------
    ConvergenceError
        if the optimizer does not report success.
    """
    # TODO: move this to function kwarg when I remove py2
    show_progress = kwargs.pop("show_progress", False)

    n_params = sum(X.shape[1] for X in Xs)
    init_values = np.zeros((n_params,))

    # Forward *every* design matrix. The previous `(T, E, Xs[0], Xs[1])` only
    # worked for exactly two matrices and disagreed with both `n_params` and
    # the Hessian call below. Tuple concatenation is valid on py2 and py3.
    likelihood_args = (T, E) + tuple(Xs)

    results = minimize(
        value_and_grad(self._negative_log_likelihood),
        init_values,
        # method=None lets scipy choose its default solver.
        method=None if self.l1_ratio <= 0.0 else "L-BFGS-B",
        jac=True,
        args=likelihood_args,
        options={"disp": show_progress},
    )

    # Print the report once: on request, or whenever the fit failed.
    if show_progress or not results.success:
        print(results)

    if results.success:
        # pylint: disable=no-value-for-parameter
        hessian_ = hessian(self._negative_log_likelihood)(results.x, *likelihood_args)
        return results.x, -self._n_examples * results.fun, self._n_examples * hessian_

    raise ConvergenceError(
        dedent(
            """\
            Fitting did not converge. This could be a problem with your data:
            1. Are there any extreme values? (Try modelling them or dropping them to see if it helps convergence)
            """
        )
    )
def _fit_model(self, T, E, entry, show_progress=True):
    """
    Minimize ``self._negative_log_likelihood`` with L-BFGS-B.

    Parameters
    ----------
    T, E:
        forwarded unchanged to the negative log-likelihood.
    entry:
        array-like; only its strictly positive elements are forwarded.
    show_progress: bool
        passed to the optimizer as its ``disp`` option.

    Returns
    -------
    tuple
        ``(results.x, -results.fun, hessian * T.shape[0])`` where ``hessian``
        is the Hessian of the negative log-likelihood at ``results.x``.

    Raises
    ------
    ConvergenceError
        if the optimizer does not report success.
    """
    positive_entries = entry[entry > 0]

    # Warnings emitted while optimizing are silenced for the whole block.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # value_and_grad returns the objective and its gradient together,
        # which is why jac=True is passed.
        results = minimize(
            value_and_grad(self._negative_log_likelihood),  # pylint: disable=no-value-for-parameter
            self._initial_values,
            jac=True,
            method="L-BFGS-B",
            args=(T, E, positive_entries),
            bounds=self._bounds,
            options={"disp": show_progress},
        )

        if not results.success:
            # Dump the optimizer report before failing so the user can inspect it.
            print(results)
            raise ConvergenceError(
                dedent(
                    """\
                    Fitting did not converge.

                    1. Are two parameters in the model colinear / exchangeable? (Change model)
                    2. Is the cumulative hazard always non-negative and always non-decreasing? (Assumption error)
                    3. Are there inputs to the cumulative hazard that could produce nans or infs? (Check your _bounds)

                    This could be a problem with your data:
                    1. Are there any extreme values? (Try modelling them or dropping them to see if it helps convergence)
                    """
                )
            )

        # pylint: disable=no-value-for-parameter
        hessian_ = hessian(self._negative_log_likelihood)(results.x, T, E, positive_entries)
        return results.x, -results.fun, hessian_ * T.shape[0]
def _newton_rhaphson(self, T, E, precision=1e-5, show_progress=False):
    """
    Damped Newton-Raphson iteration over a two-element parameter vector.

    The starting point comes from ``_smart_search``. Each step solves
    ``h @ delta = -0.9 * g`` and adds ``delta`` to the parameters in place.

    Parameters
    ----------
    T, E:
        forwarded unchanged to the likelihood, gradient and Hessian helpers.
    precision: float
        iteration stops once ``norm(delta)`` drops below this value.
    show_progress: bool
        print one diagnostic line per iteration and a completion message.

    Returns
    -------
    tuple
        ``(parameters, hessian)``, where ``hessian`` is the matrix used for
        the last step taken.

    Raises
    ------
    ConvergenceError
        if a step contains NaN.

    Warns
    -----
    ConvergenceWarning
        if the loop ends without reaching ``precision``.
    """
    from lifelines.utils import _smart_search

    def hessian_function(parameters, T, E):
        # Symmetric 2x2 matrix: the cross term fills both off-diagonal cells.
        top_row = [_d_lambda_d_lambda_(parameters, T, E), _d_rho_d_lambda_(parameters, T, E)]
        bottom_row = [_d_rho_d_lambda_(parameters, T, E), _d_rho_d_rho(parameters, T, E)]
        return np.array([top_row, bottom_row])

    def gradient_function(parameters, T, E):
        return np.array([_lambda_gradient(parameters, T, E), _rho_gradient(parameters, T, E)])

    # initialize the parameters. This shows dramatic improvements.
    parameters = _smart_search(_negative_log_likelihood, 2, T, E)

    step_size = 0.9
    max_steps = 50
    completed = False
    i = 1
    start = time.time()

    # `i` starts at 1 and is bumped after every pass, including the last one.
    while not completed and i < max_steps:
        # Evaluate into temporaries so a bad step cannot clobber the saved result.
        h = hessian_function(parameters, T, E)
        g = gradient_function(parameters, T, E)

        delta = solve(h, -step_size * g.T)
        if np.any(np.isnan(delta)):
            raise ConvergenceError("delta contains nan value(s). Convergence halted.")

        parameters += delta
        # Save these as pending result
        hessian = h

        step_norm = norm(delta)
        if show_progress:
            print(
                "Iteration %d: norm_delta = %.5f, seconds_since_start = %.1f"
                % (i, step_norm, time.time() - start)
            )

        if step_norm < precision:
            completed = True
        i += 1

    if completed:
        if show_progress:
            print("Convergence completed after %d iterations." % (i))
    else:
        warnings.warn(
            "Newton-Rhapson failed to converge sufficiently in %d steps." % max_steps,
            ConvergenceWarning,
        )

    return parameters, hessian
def _fit_model(self, T, E, weights, X, show_progress=False, initial_point=None):
    """
    Minimize ``self._negative_log_likelihood`` with scipy's default solver.

    Parameters
    ----------
    T, E, weights, X:
        forwarded unchanged to the negative log-likelihood.
    show_progress: bool
        passed to the optimizer as ``disp``; also prints the optimizer report.
    initial_point: optional
        starting vector; when None it is obtained from
        ``self._create_initial_point(T, E, X)``.

    Returns
    -------
    tuple
        ``(results.x, -w * results.fun, w * hessian)`` with ``w = weights.sum()``
        and ``hessian`` the Hessian of the negative log-likelihood at
        ``results.x``.

    Raises
    ------
    ConvergenceError
        if the optimizer does not report success.
    """
    if initial_point is None:
        initial_point = self._create_initial_point(T, E, X)

    fit_args = (T, E, weights, X)

    results = minimize(
        # a joint value-and-gradient evaluation shares computations, which is
        # much faster than evaluating the two separately.
        value_and_grad(self._negative_log_likelihood),
        initial_point,
        method=None,
        jac=True,
        args=fit_args,
        options={"disp": show_progress},
    )

    converged = results.success
    if show_progress or not converged:
        print(results)

    if not converged:
        raise ConvergenceError(
            dedent(
                """\
                Fitting did not converge. This could be a problem with your data:
                1. Does a column have extremely high mean or variance? Try standardizing it.
                2. Are there any extreme outliers? Try modeling them or dropping them to see if it helps convergence
                3. Trying adding a small penalizer (or changing it, if already present). Example: `%s(penalizer=0.01).fit(...)`
                """
                % self._class_name
            )
        )

    sum_weights = weights.sum()
    # pylint: disable=no-value-for-parameter
    hessian_ = hessian(self._negative_log_likelihood)(results.x, *fit_args)
    return results.x, -sum_weights * results.fun, sum_weights * hessian_
def _fit_model(self, likelihood, Ts, Xs, E, weights, entries, show_progress=False, initial_point=None):
    """
    Minimize the penalized negative of ``likelihood`` with scipy's default solver.

    The objective is stored on the instance as
    ``self._neg_likelihood_with_penalty_function``. It negates
    ``self._wrap_ll(likelihood)`` and passes the result through
    ``self._add_penalty``.

    Parameters
    ----------
    likelihood: callable
        wrapped by ``self._wrap_ll`` before use.
    Ts, E, weights, entries, Xs:
        forwarded to the objective in the order ``(Ts, E, weights, entries, Xs)``.
    show_progress: bool
        passed to the optimizer as ``disp``; also prints the optimizer report.
    initial_point: optional
        starting vector; when None it is obtained from
        ``self._create_initial_point(Xs)``. Its first dimension must equal
        ``Xs.size``.

    Returns
    -------
    tuple
        ``(results.x, -w * results.fun, w * hessian)`` with ``w = weights.sum()``
        and ``hessian`` the Hessian of the objective at ``results.x``.

    Raises
    ------
    AssertionError
        if the starting vector has the wrong length.
    ConvergenceError
        if the optimizer does not report success.
    """
    if initial_point is None:
        initial_point = self._create_initial_point(Xs)

    assert initial_point.shape[0] == Xs.size, "initial_point is not the correct shape."

    self._neg_likelihood_with_penalty_function = lambda *args: self._add_penalty(
        -self._wrap_ll(likelihood)(*args), *args
    )

    fit_args = (Ts, E, weights, entries, Xs)

    results = minimize(
        # a joint value-and-gradient evaluation shares computations, which is
        # much faster than evaluating the two separately.
        value_and_grad(self._neg_likelihood_with_penalty_function),
        initial_point,
        method=None,
        jac=True,
        args=fit_args,
        options={"disp": show_progress},
    )

    converged = results.success
    if show_progress or not converged:
        print(results)

    if converged:
        sum_weights = weights.sum()
        # pylint: disable=no-value-for-parameter
        hessian_ = hessian(self._neg_likelihood_with_penalty_function)(results.x, *fit_args)
        return results.x, -sum_weights * results.fun, sum_weights * hessian_

    name = self._class_name
    raise ConvergenceError(
        dedent(
            """\
            Fitting did not converge. This could be a problem with your dataset:

            0. Are there any lifelines warnings outputted during the `fit`?
            1. Inspect your DataFrame: does everything look as expected?
            2. Is there high-collinearity in the dataset? Try using the variance inflation factor (VIF) to find redundant variables.
            3. Trying adding a small penalizer (or changing it, if already present). Example: `%s(penalizer=0.01).fit(...)`.
            4. Are there any extreme outliers? Try modeling them or dropping them to see if it helps convergence.
            """
            % name
        )
    )
def _fit_model(self, T, E, initial_values=None):
    """
    Minimize ``_negative_log_likelihood`` over two parameters with BFGS.

    The objective and its analytic gradient are evaluated on ``log(T)``.

    Parameters
    ----------
    T:
        durations; only ``log(T)`` is passed to the objective.
    E:
        forwarded unchanged to the objective and gradient helpers.
    initial_values: optional
        starting vector; defaults to the mean and standard deviation of
        ``log(T)``.

    Returns
    -------
    tuple
        ``(results.x, -results.fun, results.hess_inv)``.

    Raises
    ------
    ConvergenceError
        if the optimizer does not report success.
    """
    log_T = log(T)

    if initial_values is None:
        initial_values = np.array([log_T.mean(), log_T.std()])

    def gradient_function(parameters, log_T, E):
        # Analytic gradient assembled from the two module-level partials.
        d_mu = _mu_gradient(parameters, log_T, E)
        d_sigma = _sigma_gradient(parameters, log_T, E)
        return np.array([d_mu, d_sigma])

    results = minimize(
        _negative_log_likelihood,
        initial_values,
        args=(log_T, E),
        jac=gradient_function,
        method="BFGS",
        options={"gtol": 1e-5},
    )

    if not results.success:
        # Dump the optimizer report before failing so the user can inspect it.
        print(results)
        raise ConvergenceError("Did not converge. This is a lifelines problem, not yours;")

    return results.x, -results.fun, results.hess_inv
def _newton_rhaphson(
    self,
    df,
    events,
    start,
    stop,
    weights,
    show_progress=False,
    step_size=None,
    precision=10e-6,
    max_steps=50,
    initial_point=None,
):  # pylint: disable=too-many-arguments,too-many-locals,too-many-branches,too-many-statements
    """
    Newton Rhaphson algorithm for fitting CPH model.

    On return the Hessian, gradient and log-likelihood of the last iteration
    are stored in ``self._hessian_``, ``self._score_`` and
    ``self._log_likelihood``. When the first iteration starts from an all-zero
    vector, its log-likelihood is also stored in ``self._log_likelihood_null``.

    Parameters
    ----------
    df: DataFrame
        covariates; its column count fixes the length of beta.
    events, start, stop, weights:
        pandas objects forwarded to ``self._get_gradients`` (as ``.values``
        when ``self.strata`` is None, otherwise through
        ``self._partition_by_strata_and_apply``).
    show_progress: boolean, optional (default: False)
        to show verbose output of convergence
    step_size: float > 0
        to determine a starting step size in NR algorithm.
    precision: float
        the convergence halts if the norm of delta between successive positions is less than epsilon.
    max_steps: int
        the iteration gives up (without completing) after this many steps.
    initial_point: array, optional
        starting value for beta (default: zeros). It is copied, never modified.

    Returns
    --------
    beta: (d,) numpy array.

    Raises
    ------
    ConvergenceError
        if the linear solve fails or the step contains NaN.
    """
    assert precision <= 1.0, "precision must be less than or equal to 1."

    _, d = df.shape

    # make sure betas are correct size.
    if initial_point is not None:
        # Copy: `beta += delta` below works in place and must not overwrite the
        # caller's array.
        beta = np.array(initial_point, dtype=float)
    else:
        beta = np.zeros((d,))

    # Shared tail of every ConvergenceError message raised below.
    tips = (
        "Please see the following tips in the lifelines documentation:\n"
        "https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model\n"
    )

    i = 0
    converging = True
    ll, previous_ll = 0, 0
    start_time = time.time()

    step_sizer = StepSizer(step_size)
    step_size = step_sizer.next()

    while converging:
        i += 1

        if self.strata is None:
            h, g, ll = self._get_gradients(
                df.values, events.values, start.values, stop.values, weights.values, beta
            )
        else:
            # Accumulate Hessian, gradient and log-likelihood over the strata.
            g = np.zeros_like(beta)
            h = np.zeros((d, d))
            ll = 0
            for _h, _g, _ll in self._partition_by_strata_and_apply(
                df, events, start, stop, weights, self._get_gradients, beta
            ):
                g += _g
                h += _h
                ll += _ll

        if i == 1 and np.all(beta == 0):
            # this is a neat optimization, the null partial likelihood
            # is the same as the full partial but evaluated at zero.
            # if the user supplied a non-trivial initial point, we need to delay this.
            self._log_likelihood_null = ll

        if self.penalizer > 0:
            # add the gradient and hessian of the l2 term
            g -= self.penalizer * beta
            h.flat[:: d + 1] -= self.penalizer

        try:
            # reusing a piece to make g * inv(h) * g.T faster later
            # NOTE(review): `sym_pos` is deprecated in recent scipy.linalg.solve
            # in favour of `assume_a="pos"` - confirm which solver `spsolve` is.
            inv_h_dot_g_T = spsolve(-h, g, sym_pos=True)
        except ValueError as e:
            if "infs or NaNs" in str(e):
                raise ConvergenceError(
                    "hessian or gradient contains nan or inf value(s). Convergence halted. " + tips,
                    e,
                )
            # something else?
            raise
        except LinAlgError as e:
            raise ConvergenceError(
                "Convergence halted due to matrix inversion problems. Suspicion is high colinearity. " + tips,
                e,
            )

        delta = step_size * inv_h_dot_g_T

        if np.any(np.isnan(delta)):
            raise ConvergenceError("delta contains nan value(s). Convergence halted. " + tips)

        # Save these as pending result
        hessian, gradient = h, g
        norm_delta = norm(delta)
        newton_decrement = g.dot(inv_h_dot_g_T) / 2

        if show_progress:
            print(
                "\rIteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, newton_decrement = %.5f, seconds_since_start = %.1f"
                % (i, norm_delta, step_size, ll, newton_decrement, time.time() - start_time),
                end="",
            )

        # convergence criteria
        if norm_delta < precision:
            converging, completed = False, True
        elif previous_ll != 0 and abs(ll - previous_ll) / (-previous_ll) < 1e-09:
            # this is what R uses by default
            converging, completed = False, True
        elif newton_decrement < 10e-8:
            converging, completed = False, True
        elif i >= max_steps:
            # 50 iterations steps with N-R is a lot.
            # Expected convergence is less than 10 steps
            converging, completed = False, False
        elif step_size <= 0.0001:
            converging, completed = False, False
        elif abs(ll) < 0.0001 and norm_delta > 1.0:
            warnings.warn(
                "The log-likelihood is getting suspiciously close to 0 and the delta is still large. "
                "There may be complete separation in the dataset. "
                "This may result in incorrect inference of coefficients. "
                "See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression",
                ConvergenceWarning,
            )
            converging, completed = False, False

        step_size = step_sizer.update(norm_delta).next()

        beta += delta
        # Remember this iteration's log-likelihood so the relative-change
        # criterion above has something to compare against on the next pass.
        previous_ll = ll

    self._hessian_ = hessian
    self._score_ = gradient
    self._log_likelihood = ll

    if show_progress and completed:
        print("Convergence completed after %d iterations." % (i))
    elif show_progress and not completed:
        print("Convergence failed. See any warning messages.")

    # report to the user problems that we detect.
    if completed and norm_delta > 0.1:
        warnings.warn(
            "Newton-Rhapson convergence completed but norm(delta) is still high, %.3f. "
            "This may imply non-unique solutions to the maximum likelihood. "
            "Perhaps there is colinearity or complete separation in the dataset?" % norm_delta,
            ConvergenceWarning,
        )
    elif not completed:
        warnings.warn(
            "Newton-Rhapson failed to converge sufficiently in %d steps." % max_steps,
            ConvergenceWarning,
        )

    return beta
def _newton_rhaphson(
    self,
    X,
    T,
    E,
    weights=None,
    initial_beta=None,
    step_size=None,
    precision=10e-6,
    show_progress=True,
    max_steps=50,
):
    """
    Newton Rhaphson algorithm for fitting CPH model.

    Note that data is assumed to be sorted on T!

    Every visited beta is appended (as a copy) to ``self.path``. On return
    the Hessian, gradient and log-likelihood of the last iteration are stored
    in ``self._hessian_``, ``self._score_`` and ``self._log_likelihood``.

    Parameters:
        X: (n,d) Pandas DataFrame of observations.
        T: (n) Pandas Series representing observed durations.
        E: (n) Pandas Series representing death events.
        weights: (n) Pandas Series of weights per observation. Defaults to
            a weight of 1 for every row, indexed like X.
        initial_beta: (d,1) numpy array of initial starting point for
            NR algorithm. Default 0. It is copied, never modified.
        step_size: float > 0.001 to determine a starting step size in NR algorithm.
        precision: the convergence halts if the norm of delta between
            successive positions is less than epsilon.
        show_progress: since the fitter is iterative, show convergence
            diagnostics.
        max_steps: the maximum number of iterations of the Newton-Rhaphson algorithm.

    Returns:
        beta: (d,1) numpy array.

    Raises:
        NotImplementedError: if ``self.tie_method`` is not 'Efron'.
        ConvergenceError: if the Hessian/gradient or the step contains
            NaN/inf values.
    """
    self.path = []
    assert precision <= 1.0, "precision must be less than or equal to 1."
    n, d = X.shape

    if weights is None:
        # The signature advertises None, but the body needs `.values` / `.loc`,
        # so fall back to unit weights aligned with X's index.
        import pandas as pd

        weights = pd.Series(np.ones((n,)), index=X.index)

    # make sure betas are correct size.
    if initial_beta is not None:
        assert initial_beta.shape == (d, 1)
        # Copy: `beta += delta` below works in place and must not overwrite the
        # caller's array.
        beta = np.array(initial_beta, dtype=float)
    else:
        beta = np.zeros((d, 1))

    step_sizer = StepSizer(step_size)
    step_size = step_sizer.next()

    # Method of choice is just efron right now
    if self.tie_method == "Efron":
        get_gradients = self._get_efron_values
    else:
        raise NotImplementedError("Only Efron is available.")

    # Shared tail of every ConvergenceError message raised below.
    tips = (
        "Please see the following tips in the lifelines documentation:\n"
        "https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model\n"
    )

    i = 0
    converging = True
    ll, previous_ll = 0, 0
    start = time.time()

    while converging:
        self.path.append(beta.copy())
        i += 1

        if self.strata is None:
            h, g, ll = get_gradients(X.values, beta, T.values, E.values, weights.values)
        else:
            # Accumulate Hessian, gradient and log-likelihood over the strata,
            # which are identified by the distinct values of X's index.
            g = np.zeros_like(beta).T
            h = np.zeros((beta.shape[0], beta.shape[0]))
            ll = 0
            for strata in np.unique(X.index):
                stratified_X = X.loc[[strata]]
                stratified_T = T.loc[[strata]]
                stratified_E = E.loc[[strata]]
                stratified_W = weights.loc[[strata]]
                _h, _g, _ll = get_gradients(
                    stratified_X.values,
                    beta,
                    stratified_T.values,
                    stratified_E.values,
                    stratified_W.values,
                )
                g += _g
                h += _h
                ll += _ll

        if self.penalizer > 0:
            # add the gradient and hessian of the l2 term
            g -= self.penalizer * beta.T
            h.flat[:: d + 1] -= self.penalizer

        # reusing a piece to make g * inv(h) * g.T faster later
        try:
            # NOTE(review): `sym_pos` is deprecated in recent scipy.linalg.solve
            # in favour of `assume_a="pos"` - confirm which solver `spsolve` is.
            inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True)
        except ValueError as e:
            if "infs or NaNs" in str(e):
                raise ConvergenceError(
                    "hessian or gradient contains nan or inf value(s). Convergence halted. " + tips
                )
            # something else?
            raise

        delta = step_size * inv_h_dot_g_T

        if np.any(np.isnan(delta)):
            raise ConvergenceError("delta contains nan value(s). Convergence halted. " + tips)

        # Save these as pending result
        hessian, gradient = h, g
        norm_delta = norm(delta)

        # reusing an above piece to make g * inv(h) * g.T faster.
        newton_decrement = g.dot(inv_h_dot_g_T) / 2

        if show_progress:
            print(
                "Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, newton_decrement = %.5f, seconds_since_start = %.1f"
                % (i, norm_delta, step_size, ll, newton_decrement, time.time() - start)
            )

        # convergence criteria
        if norm_delta < precision:
            converging, completed = False, True
        elif previous_ll != 0 and abs(ll - previous_ll) / (-previous_ll) < 1e-09:
            # this is what R uses by default
            converging, completed = False, True
        elif newton_decrement < precision:
            converging, completed = False, True
        elif i >= max_steps:
            # 50 iterations steps with N-R is a lot.
            # Expected convergence is ~10 steps
            converging, completed = False, False
        elif step_size <= 0.00001:
            converging, completed = False, False
        elif abs(ll) < 0.0001 and norm_delta > 1.0:
            warnings.warn(
                "The log-likelihood is getting suspciously close to 0 and the delta is still large. "
                "There may be complete separation in the dataset. "
                "This may result in incorrect inference of coefficients. "
                "See https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faqwhat-is-complete-or-quasi-complete-separation-in-logisticprobit-regression-and-how-do-we-deal-with-them/ ",
                ConvergenceWarning,
            )
            converging, completed = False, False

        step_size = step_sizer.update(norm_delta).next()

        beta += delta
        previous_ll = ll

    self._hessian_ = hessian
    self._score_ = gradient
    self._log_likelihood = ll

    if show_progress and completed:
        print("Convergence completed after %d iterations." % (i))
    if not completed:
        warnings.warn(
            "Newton-Rhapson failed to converge sufficiently in %d steps." % max_steps,
            ConvergenceWarning,
        )

    return beta
def _newton_rhaphson(
    self, df, stop_times_events, weights, show_progress=False, step_size=None, precision=10e-6, max_steps=50
):
    """
    Newton Rhaphson algorithm for fitting CPH model.

    Note that data is assumed to be sorted on T!

    On return the Hessian, gradient and log-likelihood of the last iteration
    are stored in ``self._hessian_``, ``self._score_`` and
    ``self._log_likelihood``.

    Parameters:
        df: (n, d) Pandas DataFrame of observations
        stop_times_events: (n, d) Pandas DataFrame of meta information about the subjects history
        weights: forwarded unchanged to ``self._get_gradients``.
        show_progress: True to show verbose output of convergence
        step_size: float > 0 to determine a starting step size in NR algorithm.
        precision: the convergence halts if the norm of delta between
            successive positions is less than epsilon.
        max_steps: the iteration gives up (without completing) after this
            many steps.

    Returns:
        beta: (d,1) numpy array.

    Raises:
        ConvergenceError: if the Hessian/gradient or the step contains
            NaN/inf values.
    """
    assert precision <= 1.0, "precision must be less than or equal to 1."

    _, d = df.shape

    # make sure betas are correct size.
    beta = np.zeros((d, 1))

    # Shared tail of every ConvergenceError message raised below.
    tips = (
        "Please see the following tips in the lifelines documentation:\n"
        "https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-model\n"
    )

    i = 0
    converging = True
    ll, previous_ll = 0, 0
    start = time.time()

    step_sizer = StepSizer(step_size)
    step_size = step_sizer.next()

    while converging:
        i += 1
        h, g, ll = self._get_gradients(df, stop_times_events, weights, beta)

        if self.penalizer > 0:
            # add the gradient and hessian of the l2 term
            g -= self.penalizer * beta.T
            h.flat[:: d + 1] -= self.penalizer

        try:
            # reusing a piece to make g * inv(h) * g.T faster later
            # NOTE(review): `sym_pos` is deprecated in recent scipy.linalg.solve
            # in favour of `assume_a="pos"` - confirm which solver `spsolve` is.
            inv_h_dot_g_T = spsolve(-h, g.T, sym_pos=True)
        except ValueError as e:
            if "infs or NaNs" in str(e):
                raise ConvergenceError(
                    "hessian or gradient contains nan or inf value(s). Convergence halted. " + tips
                )
            # something else?
            raise

        delta = step_size * inv_h_dot_g_T

        if np.any(np.isnan(delta)):
            raise ConvergenceError("delta contains nan value(s). Convergence halted. " + tips)

        # Save these as pending result
        hessian, gradient = h, g
        norm_delta = norm(delta)
        newton_decrement = g.dot(inv_h_dot_g_T) / 2

        if show_progress:
            print(
                "Iteration %d: norm_delta = %.5f, step_size = %.5f, ll = %.5f, newton_decrement = %.5f, seconds_since_start = %.1f"
                % (i, norm_delta, step_size, ll, newton_decrement, time.time() - start)
            )

        # convergence criteria
        if norm_delta < precision:
            converging, completed = False, True
        elif previous_ll != 0 and abs(ll - previous_ll) / (-previous_ll) < 1e-09:
            # this is what R uses by default
            converging, completed = False, True
        elif newton_decrement < 10e-8:
            converging, completed = False, True
        elif i >= max_steps:
            # 50 iterations steps with N-R is a lot.
            # Expected convergence is less than 10 steps
            converging, completed = False, False
        elif step_size <= 0.0001:
            converging, completed = False, False
        elif abs(ll) < 0.0001 and norm_delta > 1.0:
            warnings.warn(
                "The log-likelihood is getting suspciously close to 0 and the delta is still large. "
                "There may be complete separation in the dataset. "
                "This may result in incorrect inference of coefficients. "
                "See https://stats.idre.ucla.edu/other/mult-pkg/faq/general/faqwhat-is-complete-or-quasi-complete-separation-in-logisticprobit-regression-and-how-do-we-deal-with-them/ ",
                ConvergenceWarning,
            )
            converging, completed = False, False

        step_size = step_sizer.update(norm_delta).next()

        beta += delta
        # Remember this iteration's log-likelihood so the relative-change
        # criterion above has something to compare against on the next pass.
        previous_ll = ll

    self._hessian_ = hessian
    self._score_ = gradient
    self._log_likelihood = ll

    if show_progress and completed:
        print("Convergence completed after %d iterations." % (i))
    if not completed:
        warnings.warn(
            "Newton-Rhapson failed to converge sufficiently in %d steps." % max_steps,
            ConvergenceWarning,
        )

    return beta