Example #1
def get_predictive_percentile_calibration(runs, percentile,
                                          method='exact', max_Ntest=50):
  model = models.PoissonRegressionModel(None,
                                        None,
                                        example_weights=None,
                                        test_data=None)
  tail = (100 - percentile) / 2.0  # e.g. percentile=95 -> check the [2.5, 97.5] interval
  in_interval = []
  for run in runs:
    if method == 'exact':
      bootstrap_samples = run['multinomial']['bootstrap_params_exact']
    elif method == 'appx':
      bootstrap_samples = run['multinomial']['bootstrap_params_appx']
    else:
      raise ValueError("method must be 'exact' or 'appx'")

    test_data = run['test_data']
    test_data.Y = test_data.Y[:max_Ntest]
    test_data.X = test_data.X[:max_Ntest]
    test_data.N = test_data.X.shape[0]
    model.test_data = test_data
    Ntest = test_data.N
    
    in_interval_for_run = np.zeros(Ntest, dtype=bool)
    for n in range(Ntest):
      sampled_ys = model.get_predictive_distribution(bootstrap_samples,
                                                     model.test_data.X[n],
                                                     Nsamples=100)
      true_y = model.test_data.Y[n]
      lower = np.percentile(sampled_ys, tail)
      upper = np.percentile(sampled_ys, 100-tail)

      in_interval_for_run[n] = (lower <= true_y) and (true_y <= upper)
    in_interval.append(in_interval_for_run)
  return in_interval
Example #2
def get_distances(data, all_pairs):

    # Extracts the quantile distances between patients
    history = get_history_length(data)
    q0_set = np.percentile(all_pairs, 5, axis=1, keepdims=True)
    q1_set = np.percentile(all_pairs, 10, axis=1, keepdims=True)
    q2_set = np.percentile(all_pairs, 15, axis=1, keepdims=True)
    quantiles = np.hstack((history, q0_set, q1_set, q2_set))
    return quantiles
Example #3
def get_orthogonality_score(C_matrix, verbose=True):
    """
    Gets the angle between each subspace and the other ones.

    Note that we leave the diagonal as zeros: the angle of a subspace with
    itself is 0 anyway, and excluding it gives a more representative mean.
    """

    in_degree = True

    len_1, len_2 = C_matrix.shape
    orthogonality_matrix = np.zeros((len_2, len_2))

    for lat_i in range(0, len_2):
        for lat_j in range(lat_i + 1, len_2):
            angle = np.dot(C_matrix[:, lat_i], C_matrix[:, lat_j]) / (
                np.linalg.norm(C_matrix[:, lat_i]) *
                np.linalg.norm(C_matrix[:, lat_j]))
            orthogonality_matrix[lat_i, lat_j] = np.arccos(np.abs(angle))
            orthogonality_matrix[lat_j, lat_i] = np.arccos(np.abs(angle))

    if in_degree:
        orthogonality_matrix = 180 * orthogonality_matrix / np.pi

    mean_per_sub_space = np.sum(np.abs(orthogonality_matrix), 1) / (len_2 - 1)

    glob_mean = np.mean(mean_per_sub_space)

    try:
        all_non_diag = orthogonality_matrix.flatten()
        all_non_diag = all_non_diag[np.nonzero(all_non_diag)]

        lower_quartile = np.percentile(all_non_diag, 25)
        upper_quartile = np.percentile(all_non_diag, 75)

        small_avr = np.average(
            all_non_diag,
            weights=(all_non_diag <= lower_quartile).astype(int))
        high_avr = np.average(
            all_non_diag,
            weights=(all_non_diag >= upper_quartile).astype(int))
    except Exception:
        small_avr = glob_mean
        high_avr = glob_mean

    if verbose:
        print(np.around(orthogonality_matrix, 2))
        print("Mean abs angle per subspace: ", mean_per_sub_space)
        print("Mean abs angle overall: ", glob_mean)
        #print("Std abs angle overall: ", np.std(mean_per_sub_space))

    # print(small_avr, high_avr)
    if len_2 <= 1:
        glob_mean = small_avr = high_avr = 0

    return glob_mean, small_avr, high_avr
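# Hypothetical usage sketch (not part of the original source; assumes numpy as np):
# for orthonormal columns every pairwise angle is ~90 degrees, so all three
# returned summaries should land near 90.
rng = np.random.default_rng(0)
C, _ = np.linalg.qr(rng.standard_normal((20, 4)))  # 4 orthonormal columns in R^20
glob_mean, small_avr, high_avr = get_orthogonality_score(C, verbose=False)
print(glob_mean, small_avr, high_avr)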
def plot_summary(x,
                 s,
                 interval=95,
                 num_samples=100,
                 sample_color='k',
                 sample_alpha=0.4,
                 interval_alpha=0.25,
                 color='r',
                 legend=True,
                 title="",
                 plot_mean=True,
                 plot_median=False,
                 label=""):
    b = 0.5 * (100 - interval)

    lower = np.percentile(s, b, axis=0).T
    upper = np.percentile(s, 100 - b, axis=0).T

    if plot_median:
        median = np.percentile(s, [50], axis=0).T
        lab = 'Median'
        if len(label) > 0:
            lab += " %s" % label
        plt.plot(x.ravel(), median, label=lab, color=color, linewidth=4)

    if plot_mean:
        mean = np.mean(s, axis=0).T
        lab = 'Mean'
        if len(label) > 0:
            lab += " %s" % label
        plt.plot(x.ravel(), mean, '--', label=lab, color=color, linewidth=4)
    plt.fill_between(x.ravel(),
                     lower.ravel(),
                     upper.ravel(),
                     color=color,
                     alpha=interval_alpha,
                     label='%d%% Interval' % interval)

    if num_samples > 0:
        idx_samples = np.random.choice(range(len(s)),
                                       size=num_samples,
                                       replace=False)
        plt.plot(x,
                 s[idx_samples, :].T,
                 color=sample_color,
                 alpha=sample_alpha)

    if legend:
        plt.legend(loc='best')

    if len(title) > 0:
        plt.title(title, fontweight='bold')
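# Hypothetical usage sketch (not part of the original source; assumes numpy as np
# and matplotlib.pyplot as plt): summarize 500 noisy sine curves on a shared grid.
x = np.linspace(0, 2 * np.pi, 100)
s = np.sin(x)[None, :] + 0.3 * np.random.randn(500, 100)
plot_summary(x, s, interval=95, num_samples=20, title="Toy summary")
plt.show()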
Example #5
    def callback(self, th, t, g, tskip=20, n_samps=10):
        """ custom callback --- prints statistics of all gradient comps"""
        if t % tskip == 0:
            fval = self.elbo_mc(th, n_samps=n_samps)
            gm, gv = np.abs(g[:self.D]), np.abs(g[self.D:])
            print \
"""
iter {t}; val = {val}, abs gm = {m} [{mlo}, {mhi}]
                           gv = {v} [{vlo}, {vhi}]
""".format(t=t, val="%2.4f"%fval,
                m  ="%2.4f"%np.mean(gm),
                mlo="%2.4f"%np.percentile(gm, 1.),
                mhi="%2.4f"%np.percentile(gm, 99.),
                v  ="%2.4f"%np.mean(gv),
                vlo="%2.4f"%np.percentile(gv, 1.),
                vhi="%2.4f"%np.percentile(gv, 99.))
Example #6
def post_plot(ax, dist):
    ax.hist(dist, 50, histtype="step")
    ylim = ax.get_ylim()
    low, mid, high = np.percentile(dist, [16, 50, 84])
    ax.plot([mid, mid], ylim, c='indianred')
    for lh in (low, high):
        ax.plot([lh, lh], ylim, ':', c='indianred')
    ax.set_xlabel('Rotation Period (days)', fontsize=14)
    ax.set_ylabel('Posterior Probability', fontsize=14)
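# Hypothetical usage sketch (not part of the original source): fake posterior samples
# of a rotation period, plotted with the median and 16th/84th percentile lines.
samples = np.random.normal(loc=25.0, scale=2.0, size=5000)
fig, ax = plt.subplots()
post_plot(ax, samples)
plt.show()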
Example #7
def get_likelihood_based_interval(thetas, interval_coverage, model):
  '''
  thetas should be a list of free-parameter vectors for the model.
  interval_coverage should be in [0, 100].

  Returns the values of model.eval_objective(theta) that fall inside the
  central interval_coverage% interval of the objective values.
  '''
  #fn_vals = [model.eval_objective(theta) for theta in thetas]
  #sorted_inds = np.array(np.argsort(fn_vals))
  #thresh = int(np.floor(thetas.shape[0]*(q/100.0)))
  #return thetas[sorted_inds[:thresh]], np.array(fn_vals)[sorted_inds]

  fn_vals = np.array([model.eval_objective(theta) for theta in thetas])

  upper = np.percentile(fn_vals, interval_coverage+(100-interval_coverage)/2)
  lower = np.percentile(fn_vals, (100-interval_coverage)/2)
  inds = np.where(np.logical_and(lower <= fn_vals, fn_vals <= upper))
  return fn_vals[inds]
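# Hypothetical usage sketch (not part of the original source): a stub model whose
# objective is just the squared norm of theta, so ~90% of the values survive the cut.
class _QuadraticModel:
  def eval_objective(self, theta):
    return float(np.sum(theta ** 2))

thetas = [np.random.randn(3) for _ in range(1000)]
central_vals = get_likelihood_based_interval(thetas, interval_coverage=90,
                                             model=_QuadraticModel())
print(len(central_vals))  # roughly 900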
Example #8
    def initialize(self,
                   base_model,
                   datas,
                   inputs=None,
                   masks=None,
                   tags=None,
                   emission_optimizer="bfgs",
                   num_optimizer_iters=1000):
        print("Initializing Emissions parameters...")

        if self.D == 1 and base_model.transitions.__class__.__name__ == "DDMTransitions":
            # if self.D == 0:
            d_init = np.mean([y[0:3] for y in datas], axis=(0, 1))
            u_sum = np.array([np.sum(u) for u in inputs])
            y_end = np.array([y[-3:] for y in datas])
            u_l, u_u = np.percentile(
                u_sum, [20, 80])  # use 20th and 80th percentile input
            y_U = y_end[np.where(u_sum >= u_u)]
            y_L = y_end[np.where(u_sum <= u_l)]
            C_init = (1.0 / 2.0) * np.mean(
                (np.mean(y_U, axis=0) - np.mean(y_L, axis=0)), axis=0)
            self.Cs = C_init.reshape([1, self.N, self.D]) / self.bin_size
            self.ds = d_init.reshape([1, self.N]) / self.bin_size

        else:
            datas = [
                interpolate_data(data, mask)
                for data, mask in zip(datas, masks)
            ]

            Td = sum([data.shape[0] for data in datas])
            xs = [
                base_model.sample(T=data.shape[0], input=input)[1]
                for data, input in zip(datas, inputs)
            ]

            def _objective(params, itr):
                self.params = params
                # self.Cs = params
                obj = 0
                obj += self.log_prior()
                for data, input, mask, tag, x in \
                    zip(datas, inputs, masks, tags, xs):
                    obj += np.sum(
                        self.log_likelihoods(data, input, mask, tag, x))
                return -obj / Td

            # Optimize emissions log-likelihood
            optimizer = dict(bfgs=bfgs, lbfgs=lbfgs)[emission_optimizer]
            self.params = \
                optimizer(_objective,
                          self.params,
                          num_iters=num_optimizer_iters,
                          full_output=False)
Example #9
    def callback(self, th, t, g, tskip=20, n_samps=100):
        """ custom callback --- prints statistics of all gradient comps"""
        if t % tskip == 0:
            fval = self.elbo_mc(th, n_samps=n_samps)
            gm, gv, gC = self.unpack(g)
            gm, gv, gC = np.abs(gm), np.abs(gv), np.abs(gC)

            m, v, C = self.unpack(th)
            Cmags   = np.sqrt(np.sum(C**2, axis=0))

            if self.r > 0:
                Cm ="%2.4f"%np.mean(gC)
                Clo="%2.4f"%np.percentile(gC, 1.)
                Chi="%2.4f"%np.percentile(gC, 99.)
            else:
                Cm, Clo, Chi = "na", "na", "na"

            print \
"""
iter {t}; val = {val},
          abs gm         = {m} [{mlo}, {mhi}]
          gv             = {v} [{vlo}, {vhi}]
          gC ({D} x {r}) = {C} [{Clo}, {Chi}]
          Comp mags      = {Cmags} 

""".format(t=t, val="%2.4f"%fval,
                D  = "%d"%self.D, r="%d"%self.r,
                m  ="%2.4f"%np.mean(gm),
                mlo="%2.4f"%np.percentile(gm, 1.),
                mhi="%2.4f"%np.percentile(gm, 99.),
                v  ="%2.4f"%np.mean(gv),
                vlo="%2.4f"%np.percentile(gv, 1.),
                vhi="%2.4f"%np.percentile(gv, 99.),
                C  =Cm, Clo=Clo, Chi=Chi,
                Cmags=str(Cmags))
Example #10
def get_percentile_calibration(true_params, bs_runs,
                                 interval_coverage=90):
  '''
  Currently checks percentile estimates over each dimension of the parameters
  independently (so we only have to compute percentiles over 1-D things).

  true_params should be a D dimensional array
  bs_runs should be a list of run dicts; each run's
    ['multinomial']['bootstrap_params_appx'] entry is a B x D array, where B is
    the number of bootstrap samples.
  interval_coverage specifies the size of the interval around the median;
    i.e. 95 corresponds to the interval [2.5%, 97.5%]
  '''
  D = true_params.shape[0]
  nExp = len(bs_runs)
  in_range = np.zeros((nExp,D), dtype=bool)
  tail = (100-interval_coverage)/2.0

  for n in range(nExp):
    lower = np.percentile(bs_runs[n]['multinomial']['bootstrap_params_appx'], tail, axis=0)
    upper = np.percentile(bs_runs[n]['multinomial']['bootstrap_params_appx'], 100-tail, axis=0)
    for d in range(D):
      in_range[n,d] = (lower[d] < true_params[d]) and (true_params[d] < upper[d])
  return in_range
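# Hypothetical synthetic check (not part of the original source; assumes numpy as np):
# each run's bootstrap samples are centred on a noisy estimate of the true parameters,
# so per-dimension coverage should land near the nominal 90%.
true_params = np.array([1.0, -2.0])
rng = np.random.default_rng(0)
bs_runs = []
for _ in range(200):
  estimate = true_params + rng.standard_normal(2)
  bs_runs.append({'multinomial': {'bootstrap_params_appx':
                                  estimate + rng.standard_normal((500, 2))}})
in_range = get_percentile_calibration(true_params, bs_runs, interval_coverage=90)
print(in_range.mean(axis=0))  # roughly [0.9, 0.9]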
Example #11
    def __init__(self,data_obj,p=1,oversampled=0,t_offset=None,precomputed=None,pct_spike=95):
        # Some fns. require 'precomputed', a dict with at least two keys:
        # 'theta_star' (the output of lbfgsb) and 'fn_obj', the objective used in the optimization of theta_star.
        # t_offset: if oversampled, this is shape (N,), and gives the offset between stim trigger and frame (nbefore).
        self.data_obj = data_obj
        self.F = data_obj.F
        self.nroi = self.F.shape[0]
        self.p = p
        self.b = np.zeros((self.nroi,1,1))
        self.g = np.zeros((self.nroi,self.p,1))
        self.a = np.zeros((self.nroi,1,1))
        self.sn = np.zeros((self.nroi,1,1))
        fudge_factor = .97
        for i in range(self.nroi):
            _,s,self.b[i,0,0],gtemp,_  = deconvolve(data_obj.dfof[i].astype(np.float64),penalty=1,g=tuple([None]*self.p))
            self.g[i,:,0] = np.array(gtemp)
            self.a[i] = np.percentile(s,pct_spike)
            est = estimate_parameters(data_obj.dfof[i].astype(np.float64), p=self.p, fudge_factor=fudge_factor)
            self.sn[i] = est[1]
#        if not type(g) is tuple:
#            g = (g,)
#        self.g = np.array(g)
        #self.fn_obj = fn_obj
        #nangle = len(np.unique(data_obj.angle))
        self.noise = (self.sn**2*(1+(self.g**2).sum(1)[:,np.newaxis]))
        self.smax = 5
        #self.fn_obj.compute_helper_vars(data_obj,self)
        ##self.pFs = [self.p_F_given_s(s) for s in range(self.smax)]
        self.log_pFs = [self.log_p_F_given_s(s) for s in range(self.smax)]
        self.oversampled = oversampled
        if self.oversampled:
            self.sampwt = np.ones((self.oversampled,1))/self.oversampled
            self.sampmat = np.zeros((self.oversampled*(self.F.shape[0]-1),self.F.shape[1]),dtype='bool')
            dig = np.floor(self.oversampled*t_offset).astype('<i2')
            for i in range(self.sampmat.shape[1]):
                self.sampmat[dig::self.oversampled,i] = 1
        if precomputed:
            theta_star = precomputed['theta_star']
            fn_obj = precomputed['fn_obj']
            self.rpre = np.zeros(np.array(self.F.shape)+np.array((0,-1,0))) # one fewer time point required
            for i in range(self.nroi):
                self.rpre[i] = fn_obj.rfunc(theta_star[i][0])
Example #12
def getquantile(x, lower=0.025, upper=0.975, return_indices=False):
    """ Indicates which elements of `x` fall into a quantile range
    
    Arguments:

        x: `ndarray(nsamples)`
        lower: `float` in `[0, upper)`. Lower quantile
        upper: `float` in `(lower, 1]`. Upper quantile
        return_indices: `bool`. If `False`, returns a boolean array. If `True`, returns indices of the entries of `x` falling between `lower` and `upper`.
    
    Returns: 

        `ndarray`. Dimensionality will depend on `return_indices`

    """
    lb, ub = np.percentile(x, [lower * 100, upper * 100])
    y = np.logical_and(np.greater_equal(x, lb), np.less(x, ub))

    if return_indices:
        y = np.arange(x.size)[y]

    return y
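# Hypothetical usage sketch (not part of the original source): on standard normal
# draws, roughly 95% of samples fall between the default 2.5% and 97.5% quantiles.
x = np.random.randn(10000)
mask = getquantile(x)                      # boolean mask, ~95% True
idx = getquantile(x, return_indices=True)  # integer indices of the same entries
print(mask.mean(), idx.shape)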
			coef, intercept = baseRegression.adjust_coef(self, w)
		else: # self.prob_func_ == "softmax"
			coef = np.divide(w[:-1].T, self.scaler_.scale_)
			intercept = w[-1] - np.sum(coef * self.scaler_.mean_)
		if self.penalty_ == "l1":
			# ===FIXME===
			# I don't know the condition to shrink the coef to 0
			coef = np.array([0.0 if abs(wi) < 0.1 else wi for wi in coef])
			intercept = 0.0 if abs(intercept) < 0.1 else intercept
		return coef, intercept
	def predict(self, x):
		if self.prob_func_ == "sigmoid":
			prob = (1.0 / (1.0 + np.exp(-np.dot(x, self.coef_) - self.intercept_)))[:,np.newaxis]
			prob = np.concatenate((1.0-prob, prob), axis=1)
		else: # self.prob_func_ == "softmax"
			prob = np.exp(np.dot(x, self.coef_.T) + self.intercept_)
			prob /= np.sum(prob, axis=1)[:,np.newaxis]
		return np.array([self.classes_[i] for i in np.argmax(prob, axis=1)])
	def score(self, x, y):
		return self.accuracy(x, y)

if __name__ == "__main__":
	from sklearn.datasets import make_regression
	x, y_orig = make_regression(n_samples=10, n_features=5, n_informative=5, n_targets=1, noise=1.0, random_state=1)
#	y = np.array([1 if v >= np.mean(y_orig) else 0 for v in y_orig])
	y = np.array([0 if y < np.percentile(y_orig, 25) else 1 if y < np.percentile(y_orig, 50) else 2 if y < np.percentile(y_orig, 75) else 3 for y in y_orig])
	lr = LogisticRegression()
	lr.fit(x, y)
	print(lr.coef_, lr.intercept_)
	print(lr.score(x, y))
    def fit(
        self,
        df,
        duration_col=None,
        event_col=None,
        show_progress=False,
        timeline=None,
        weights_col=None,
        robust=False,
        initial_point=None,
    ):
        """
        Fit the accelerated failure time model to a dataset.

        Parameters
        ----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `duration_col` and
            `event_col` (see below), covariates columns, and special columns (weights).
            `duration_col` refers to
            the lifetimes of the subjects. `event_col` refers to whether
            the 'death' event was observed: 1 if observed, 0 else (censored).

        duration_col: string
            the name of the column in DataFrame that contains the subjects'
            lifetimes.

        event_col: string, optional
            the name of the column in DataFrame that contains the subjects' death
            observation. If left as None, assume all individuals are uncensored.

        show_progress: boolean, optional (default=False)
            since the fitter is iterative, show convergence
            diagnostics. Useful if convergence is failing.

        timeline: array, optional
            Specify a timeline that will be used for plotting and prediction

        weights_col: string
            the column in df that specifies weights per observation.

        robust: boolean, optional (default=False)
            Compute the robust errors using the Huber sandwich estimator.

        initial_point: (d,) numpy array, optional
            initialize the starting point of the iterative
            algorithm. Default is the zero vector.

        Returns
        -------
        self:
            self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more


        Examples
        --------
        TODO
        >>> from lifelines import WeibullAFTFitter
        >>>
        >>> df = pd.DataFrame({
        >>>     'T': [5, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>>     'E': [1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0],
        >>>     'var': [0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2],
        >>>     'age': [4, 3, 9, 8, 7, 4, 4, 3, 2, 5, 6, 7],
        >>> })
        >>>
        >>> aft = WeibullAFTFitter()
        >>> aft.fit(df, 'T', 'E')
        >>> aft.print_summary()
        >>> aft.predict_median(df)
        >>>
        >>> aft = WeibullAFTFitter()
        >>> aft.fit(df, 'T', 'E', ancillary_df=df)
        >>> aft.print_summary()
        >>> aft.predict_median(df)

        """
        if duration_col is None:
            raise TypeError("duration_col cannot be None.")

        self._time_fit_was_called = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S") + " UTC"
        self.duration_col = duration_col
        self.event_col = event_col
        self.weights_col = weights_col
        self._n_examples = df.shape[0]
        self.timeline = timeline
        self.robust = robust

        df = df.copy()

        T = pass_for_numeric_dtypes_or_raise_array(
            df.pop(duration_col)).astype(float)
        E = (pass_for_numeric_dtypes_or_raise_array(df.pop(
            self.event_col)).astype(bool) if (self.event_col is not None) else
             pd.Series(np.ones(self._n_examples, dtype=bool),
                       index=df.index,
                       name="E"))
        weights = (pass_for_numeric_dtypes_or_raise_array(
            df.pop(self.weights_col)).astype(float) if
                   (self.weights_col is not None) else pd.Series(
                       np.ones(self._n_examples, dtype=float),
                       index=df.index,
                       name="weights"))
        # check to make sure their weights are okay
        if self.weights_col:
            if (weights.astype(int) != weights).any() and not self.robust:
                warnings.warn(
                    dedent(
                        """It appears your weights are not integers, possibly propensity or sampling scores then?
                                        It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
                                        estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"""
                    ),
                    StatisticalWarning,
                )
            if (weights <= 0).any():
                raise ValueError(
                    "values in weight column %s must be positive." %
                    self.weights_col)

        self.durations = T.copy()
        self.event_observed = E.copy()
        self.weights = weights.copy()

        if np.any(self.durations <= 0):
            raise ValueError(
                "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
            )

        df = df.astype(float)
        self._check_values(df, T, E, self.event_col)

        if self.fit_intercept:
            assert "_intercept" not in df
            df["_intercept"] = 1.0

        self._LOOKUP_SLICE = self._create_slicer(len(df.columns))  # TODO

        _norm_std = df.std(0)
        self._norm_mean = df.mean(0)

        # if we included an intercept, we need to avoid dividing by zero.
        if self.fit_intercept:
            _norm_std["_intercept"] = 1.0
        else:
            _norm_std[_norm_std < 1e-8] = 1.0

        _index = pd.MultiIndex.from_tuples(
            sum([[(name, c) for c in df.columns]
                 for name in self._fitted_parameter_names], []))

        self._norm_std = pd.Series(np.concatenate([_norm_std.values] *
                                                  self.n_breakpoints),
                                   index=_index)

        _params, self._log_likelihood, self._hessian_ = self._fit_model(
            T.values,
            E.values,
            weights.values,
            normalize(df, 0, _norm_std).values,
            show_progress=show_progress,
            initial_point=initial_point,
        )
        self.params_ = _params / self._norm_std

        self.variance_matrix_ = self._compute_variance_matrix()
        self.standard_errors_ = self._compute_standard_errors(
            T.values, E.values, weights.values, df.values)
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self._predicted_cumulative_hazard_ = self.predict_cumulative_hazard(
            df, times=[np.percentile(T, 75)]).T

        return self
Example #15
    def initialize(self,
                   base_model,
                   datas,
                   inputs=None,
                   masks=None,
                   tags=None,
                   num_em_iters=50,
                   num_tr_iters=50):

        print("Initializing...")
        print("First with FA using {} steps of EM.".format(num_em_iters))
        fa, xhats, Cov_xhats, lls = factor_analysis_with_imputation(
            self.D, datas, masks=masks, num_iters=num_em_iters)

        if self.D == 1 and base_model.transitions.__class__.__name__ == "DDMTransitions":

            d_init = np.mean([y[0:3] for y in datas], axis=(0, 1))
            u_sum = np.array([np.sum(u) for u in inputs])
            y_end = np.array([y[-3:] for y in datas])
            u_l, u_u = np.percentile(
                u_sum, [20, 80])  # use 20th and 80th percentile input
            y_U = y_end[np.where(u_sum >= u_u)]
            y_L = y_end[np.where(u_sum <= u_l)]
            C_init = (1.0 / 2.0) * np.mean(
                (np.mean(y_U, axis=0) - np.mean(y_L, axis=0)), axis=0)

            self.Cs = C_init.reshape([1, self.N, self.D])
            self.ds = d_init.reshape([1, self.N])
            self.inv_etas = np.log(fa.sigmasq).reshape([1, self.N])

        else:

            # define objective
            Td = sum([x.shape[0] for x in xhats])

            def _objective(params, itr):
                new_datas = [np.dot(x, params[0].T) + params[1] for x in xhats]
                obj = base_model.log_likelihood(new_datas, inputs=inputs)
                return -obj / Td

            # initialize R and r
            R = 0.1 * np.random.randn(self.D, self.D)
            r = 0.01 * np.random.randn(self.D)
            params = [R, r]

            print(
                "Next by transforming latents to match AR-HMM prior using {} steps of max log likelihood."
                .format(num_tr_iters))
            state = None
            lls = [-_objective(params, 0) * Td]
            pbar = trange(num_tr_iters)
            pbar.set_description("Epoch {} Itr {} LP: {:.1f}".format(
                0, 0, lls[-1]))

            for itr in pbar:
                params, val, g, state = sgd_step(value_and_grad(_objective),
                                                 params, itr, state)
                lls.append(-val * Td)
                pbar.set_description("LP: {:.1f}".format(lls[-1]))
                pbar.update(1)

            R = params[0]
            r = params[1]

            # scale x's to be max at 1.1
            for d in range(self.D):
                x_transformed = [(np.dot(x, R.T) + r)[:, d] for x in xhats]
                max_x = np.max(x_transformed)
                R[d, :] *= 1.1 / max_x
                r[d] *= 1.1 / max_x

            self.Cs = (fa.W @ np.linalg.inv(R)).reshape([1, self.N, self.D])
            self.ds = fa.mean - fa.W @ np.linalg.inv(R) @ r
            self.inv_etas = np.log(fa.sigmasq).reshape([1, self.N])
Example #16
    def set_knots(self, T, E):
        self.knots = np.percentile(np.log(T[E.astype(bool).values]),
                                   np.linspace(5, 95, self.n_baseline_knots))
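# Hypothetical standalone sketch of the same knot placement (not part of the original
# source): spread n_baseline_knots knots over the 5th-95th percentiles of log event times.
T = np.random.exponential(scale=10.0, size=1000)  # durations
E = np.random.rand(1000) < 0.8                    # event indicator (True = observed)
n_baseline_knots = 3
knots = np.percentile(np.log(T[E]), np.linspace(5, 95, n_baseline_knots))
print(knots)  # 5th, 50th and 95th percentiles of the observed log durations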
Example #17
    def offline_evaluation(self, metrics, mode, data, parameter, confidence=0.95, bootstrap=False, n_bootstrap=1):
        """ Performs offline evaluation

        Args:
            metrics (dict): metrics dictionary to be filled
            mode (str): train, valid or test split
            data (tuple): tuple of np.arrays with features, actions, rewards
            parameter (np.array): optimized parameter or any baseline parameter
            confidence (float): confidence level for the interval
            bootstrap (bool): choose whether to perform bootstrap or not
            n_bootstrap (int): number of bootstrap folds

        Note:
            Computes IPS and SNIPS scores. Also computes the Student-t test, std, bootstrap std on IPS and SNIPS,
            and importance sampling diagnostics

        Returns:
            metrics (dict): contains results information on the data split
        """
        features, actions, rewards, pi_logging = data
        rng_bootstrap = np.random.RandomState(1)
        bootstrap_ips_metric = []
        bootstrap_snips_metric = []
        bootstrap_delta_snips_metric = []
        bootstrap_t_h = []
        bootstrap_std_h = []
        bootstrap_em_diagnostic = []
        bootstrap_ess_diagnostic = []

        for n in range(n_bootstrap):
            idx = rng_bootstrap.choice(np.arange(features.shape[0]), size=features.shape[0], replace=bootstrap)
            ips_metric, snips_metric = self.get_ips_and_snips_metrics(parameter, features[idx], actions[idx],
                                                                      rewards[idx], pi_logging[idx])
            loss_logging = np.mean(-rewards[idx])
            bootstrap_ips_metric.append(ips_metric)
            bootstrap_snips_metric.append(snips_metric)
            bootstrap_delta_snips_metric.append(snips_metric - loss_logging)

            # Student-t distribution test
            n_samples = self.impt_smplg_weight.shape[0]
            se = sp.stats.sem(self.impt_smplg_weight*rewards)
            t_h = se * sp.stats.t.ppf((1 + confidence) / 2., n_samples - 1)
            # Gaussian distribution test
            std_h = np.std(self.impt_smplg_weight)

            bootstrap_t_h.append(t_h)
            bootstrap_std_h.append(std_h)

            # Diagnostics
            empirical_mean_diagnostic = np.mean(self.impt_smplg_weight)
            effective_sample_size_diagnostic = (np.sum(self.impt_smplg_weight)**2/(np.sum(self.impt_smplg_weight**2)+EPS))/n_samples

            bootstrap_em_diagnostic.append(empirical_mean_diagnostic)
            bootstrap_ess_diagnostic.append(effective_sample_size_diagnostic)

        metrics['ips_{}'.format(mode)] = np.mean(bootstrap_ips_metric)
        metrics['snips_{}'.format(mode)] = np.mean(bootstrap_snips_metric)
        metrics['t_h_{}'.format(mode)] = np.mean(bootstrap_t_h)
        metrics['std_h_{}'.format(mode)] = np.mean(bootstrap_std_h)
        metrics['bootstrap_std_ips_{}'.format(mode)] = np.std(bootstrap_ips_metric)
        metrics['bootstrap_h25_ips_{}'.format(mode)] = np.percentile(bootstrap_ips_metric, 2.5)
        metrics['bootstrap_h975_ips_{}'.format(mode)] = np.percentile(bootstrap_ips_metric, 97.5)
        metrics['bootstrap_std_snips_{}'.format(mode)] = np.std(bootstrap_snips_metric)
        metrics['bootstrap_h25_snips_{}'.format(mode)] = np.percentile(bootstrap_snips_metric, 2.5)
        metrics['bootstrap_h975_snips_{}'.format(mode)] = np.percentile(bootstrap_snips_metric, 97.5)
        metrics['em_diagnostic_{}'.format(mode)] = np.mean(bootstrap_em_diagnostic)
        metrics['ess_diagnostic_{}'.format(mode)] = np.mean(bootstrap_ess_diagnostic)

        metrics['snips_delta_{}'.format(mode)] = np.mean(bootstrap_delta_snips_metric)

        metrics['bootstrap_delta_std_snips_{}'.format(mode)] = np.std(bootstrap_delta_snips_metric)
        metrics['bootstrap_delta_h25_snips_{}'.format(mode)] = np.percentile(bootstrap_delta_snips_metric, 2.5)
        metrics['bootstrap_delta_h975_snips_{}'.format(mode)] = np.percentile(bootstrap_delta_snips_metric, 97.5)


        return metrics
Example #18
def _quantile_knots(low, high, x, num_bases, degree):
    num_interior_knots = num_bases - (degree + 1)
    clipped = x[(x >= low) & (x <= high)]
    knots = np.percentile(clipped, np.linspace(0, 100, num_interior_knots + 2))
    knots = [low] + list(knots[1:-1]) + [high]
    return np.asarray(knots)
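# Hypothetical usage sketch (not part of the original source): place 10 cubic
# B-spline bases over standard normal data, clipped to [-3, 3].
x = np.random.randn(5000)
knots = _quantile_knots(low=-3.0, high=3.0, x=x, num_bases=10, degree=3)
print(knots)  # 6 interior quantile knots plus the two boundary knots -> 8 values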
Example #19
    def fit(
        self,
        df,
        duration_col=None,
        event_col=None,
        show_progress=False,
        timeline=None,
        weights_col=None,
        robust=False,
        initial_point=None,
    ):
        """
        Fit the accelerated failure time model to a dataset.

        Parameters
        ----------
        df: DataFrame
            a Pandas DataFrame with necessary columns `duration_col` and
            `event_col` (see below), covariates columns, and special columns (weights).
            `duration_col` refers to
            the lifetimes of the subjects. `event_col` refers to whether
            the 'death' event was observed: 1 if observed, 0 else (censored).

        duration_col: string
            the name of the column in DataFrame that contains the subjects'
            lifetimes.

        event_col: string, optional
            the name of the column in DataFrame that contains the subjects' death
            observation. If left as None, assume all individuals are uncensored.

        show_progress: boolean, optional (default=False)
            since the fitter is iterative, show convergence
            diagnostics. Useful if convergence is failing.

        timeline: array, optional
            Specify a timeline that will be used for plotting and prediction

        weights_col: string
            the column in df that specifies weights per observation.

        robust: boolean, optional (default=False)
            Compute the robust errors using the Huber sandwich estimator.

        initial_point: (d,) numpy array, optional
            initialize the starting point of the iterative
            algorithm. Default is the zero vector.

        Returns
        -------
        self:
            self with additional new properties: ``print_summary``, ``params_``, ``confidence_intervals_`` and more


        Examples
        --------

        >>> N, d = 80000, 2
        >>> # some numbers taken from http://statwonk.com/parametric-survival.html
        >>> breakpoints = (1, 31, 34, 62, 65)
        >>> betas = np.array(
        >>>     [
        >>>         [1.0, -0.2, np.log(15)],
        >>>         [5.0, -0.4, np.log(333)],
        >>>         [9.0, -0.6, np.log(18)],
        >>>         [5.0, -0.8, np.log(500)],
        >>>         [2.0, -1.0, np.log(20)],
        >>>         [1.0, -1.2, np.log(500)],
        >>>     ]
        >>> )

        >>> X = 0.1 * np.random.exponential(size=(N, d))
        >>> X = np.c_[X, np.ones(N)]
        >>> T = np.empty(N)
        >>> for i in range(N):
        >>>     lambdas = np.exp(-betas.dot(X[i, :]))
        >>>     T[i] = piecewise_exponential_survival_data(1, breakpoints, lambdas)[0]
        >>> T_censor = np.minimum(
        >>>     T.mean() * np.random.exponential(size=N), 110
        >>> )  # 110 is the end of observation, eg. current time.
        >>> df = pd.DataFrame(X[:, :-1], columns=["var1", "var2"])
        >>> df["T"] = np.round(np.maximum(np.minimum(T, T_censor), 0.1), 1)
        >>> df["E"] = T <= T_censor

        >>> pew = PiecewiseExponentialRegressionFitter(breakpoints=breakpoints, penalizer=0.0001).fit(df, "T", "E")
        >>> pew.print_summary()
        >>> pew.plot()

        """
        if duration_col is None:
            raise TypeError("duration_col cannot be None.")

        self._time_fit_was_called = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") + " UTC"
        self.duration_col = duration_col
        self.event_col = event_col
        self.weights_col = weights_col
        self._n_examples = df.shape[0]
        self.timeline = timeline
        self.robust = robust

        df = df.copy()

        T = pass_for_numeric_dtypes_or_raise_array(df.pop(duration_col)).astype(float)
        E = (
            pass_for_numeric_dtypes_or_raise_array(df.pop(self.event_col))
            if (self.event_col is not None)
            else pd.Series(np.ones(self._n_examples, dtype=bool), index=df.index, name="E")
        )
        weights = (
            pass_for_numeric_dtypes_or_raise_array(df.pop(self.weights_col)).astype(float)
            if (self.weights_col is not None)
            else pd.Series(np.ones(self._n_examples, dtype=float), index=df.index, name="weights")
        )
        # check to make sure their weights are okay
        if self.weights_col:
            if (weights.astype(int) != weights).any() and not self.robust:
                warnings.warn(
                    dedent(
                        """It appears your weights are not integers, possibly propensity or sampling scores then?
                                        It's important to know that the naive variance estimates of the coefficients are biased. Instead a) set `robust=True` in the call to `fit`, or b) use Monte Carlo to
                                        estimate the variances. See paper "Variance estimation when using inverse probability of treatment weighting (IPTW) with survival analysis"""
                    ),
                    StatisticalWarning,
                )
            if (weights <= 0).any():
                raise ValueError("values in weight column %s must be positive." % self.weights_col)

        df = df.astype(float)
        self._check_values(df, T, E, self.event_col)

        E = E.astype(bool)
        self.durations = T.copy()
        self.event_observed = E.copy()
        self.weights = weights.copy()

        if np.any(self.durations <= 0):
            raise ValueError(
                "This model does not allow for non-positive durations. Suggestion: add a small positive value to zero elements."
            )

        if self.fit_intercept:
            assert "_intercept" not in df
            df["_intercept"] = 1.0

        self._LOOKUP_SLICE = self._create_slicer(len(df.columns))

        _norm_std = df.std(0)
        self._norm_mean = df.mean(0)

        # if we included an intercept, we need to avoid dividing by zero.
        if self.fit_intercept:
            _norm_std["_intercept"] = 1.0
        else:
            _norm_std[_norm_std < 1e-8] = 1.0

        _index = pd.MultiIndex.from_tuples(
            sum([[(name, c) for c in df.columns] for name in self._fitted_parameter_names], [])
        )

        self._norm_std = pd.Series(np.concatenate([_norm_std.values] * self.n_breakpoints), index=_index)

        _params, self._log_likelihood, self._hessian_ = self._fit_model(
            T.values,
            E.values,
            weights.values,
            normalize(df, 0, _norm_std).values,
            show_progress=show_progress,
            initial_point=initial_point,
        )
        self.params_ = _params / self._norm_std

        self.variance_matrix_ = self._compute_variance_matrix()
        self.standard_errors_ = self._compute_standard_errors(T.values, E.values, weights.values, df.values)
        self.confidence_intervals_ = self._compute_confidence_intervals()
        self._predicted_cumulative_hazard_ = self.predict_cumulative_hazard(df, times=[np.percentile(T, 75)]).T

        return self
Example #20
        nsamps = 1000
        z = mogsamples(nsamps, theta)
        lls = moglogpdf(z, theta)
        Hmc = -np.mean(lls)
        Hmc_hi = Hmc + 3 * np.std(lls) / np.sqrt(nsamps)

        # compute bound and store gap
        Hbound = lower_bound_MoG(theta)
        gaps[i] = Hmc - Hbound

        # Hmc should be greater than Hbound
        assert Hmc_hi > Hbound, "bound isn't lower ya dope (%2.3f not greater than %2.3f)" % (
            Hmc_hi, Hbound)

    print "Gap percentiles [1, 50, 99] %s" % str(
        np.percentile(gaps, [1, 50, 99]))

    #########################################
    # test per mu_n function and gradient   #
    #########################################
    n = 0
    lbn, lbs = make_lower_bound_MoGn(theta, n, s2min=1e-7)
    thn = theta[n, :D]
    assert np.isclose(lower_bound_MoG(theta), lbn(thn)), "per n is bad"
    from autograd.util import quick_grad_check, nd
    quick_grad_check(lbn, thn)

    print "Hessiandiag, numeric hessian diag"
    hlbn = hessian(lbn)
    print np.diag(hlbn(thn))