def __init__(self,
             loss,
             penalty,
             active_groups,
             inactive_groups,
             randomization,
             solve_args={'min_its': 50, 'tol': 1.e-10},
             beta_active=None):
    """
    penalty is a group_lasso object that assigns weights to groups
    """
    (self.loss,
     self.penalty,
     self.active_groups,
     self.inactive_groups,
     self.randomization,
     self.solve_args,
     self.beta_active) = (loss,
                          penalty,
                          active_groups,
                          inactive_groups,
                          randomization,
                          solve_args,
                          beta_active)

    # boolean mask of the variables belonging to an active group
    self.active = np.zeros(self.loss.shape, bool)
    for i, g in enumerate(np.unique(self.penalty.groups)):
        if self.active_groups[i]:
            self.active[self.penalty.groups == g] = True
    self.inactive = ~self.active

    # we form a dual group lasso object
    # to compute the max score
    new_groups = penalty.groups[self.inactive]
    new_weights = {g: penalty.weights[g] for g in penalty.weights
                   if g in np.unique(new_groups)}
    self.group_lasso_dual = rr.group_lasso_dual(new_groups,
                                                weights=new_weights,
                                                lagrange=1.)
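# A minimal, self-contained sketch (not part of the class) of the mask
# construction above: given per-variable group labels and a per-group
# "active" indicator, every variable of an active group is marked active.
# All names and values here are hypothetical illustration.

def _active_mask_sketch():
    import numpy as np  # local import keeps the sketch self-contained
    groups = np.array([0, 0, 1, 1, 2])            # 5 variables in 3 groups
    active_groups = np.array([True, False, True])  # groups 0 and 2 selected
    active = np.zeros(groups.shape, bool)
    for i, g in enumerate(np.unique(groups)):
        if active_groups[i]:
            active[groups == g] = True
    # variables 0, 1 (group 0) and 4 (group 2) are active
    assert np.all(active == np.array([True, True, False, False, True]))
    return active, ~active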
def solve(self, scaling=1, solve_args={'min_its': 20, 'tol': 1.e-10}):

    self.randomize()

    # note: `solve_args` is the method argument; `self.solve_args` is
    # reserved for the restricted problems solved elsewhere
    (loss,
     randomized_loss,
     epsilon,
     penalty,
     randomization) = (self.loss,
                       self.randomized_loss,
                       self.epsilon,
                       self.penalty,
                       self.randomization)

    # initial solution

    problem = rr.simple_problem(randomized_loss, penalty)
    self.initial_soln = problem.solve(**solve_args)

    # find the active groups and their direction vectors
    # as well as unpenalized groups

    groups = np.unique(penalty.groups)
    active_groups = np.zeros(len(groups), bool)
    unpenalized_groups = np.zeros(len(groups), bool)

    active_directions = []
    active = np.zeros(loss.shape, bool)
    unpenalized = np.zeros(loss.shape, bool)

    initial_scalings = []

    for i, g in enumerate(groups):
        group = penalty.groups == g
        active_groups[i] = ((np.linalg.norm(self.initial_soln[group])
                             > 1.e-6 * penalty.weights[g])
                            and (penalty.weights[g] > 0))
        unpenalized_groups[i] = (penalty.weights[g] == 0)
        if active_groups[i]:
            active[group] = True
            z = np.zeros(active.shape, float)
            z[group] = (self.initial_soln[group]
                        / np.linalg.norm(self.initial_soln[group]))
            active_directions.append(z)
            initial_scalings.append(np.linalg.norm(self.initial_soln[group]))
        if unpenalized_groups[i]:
            unpenalized[group] = True

    # solve the restricted problem

    self._overall = active | unpenalized
    self._inactive = ~self._overall
    self._unpenalized = unpenalized
    self._active_directions = np.array(active_directions).T
    self._active_groups = np.array(active_groups, bool)
    self._unpenalized_groups = np.array(unpenalized_groups, bool)

    self.selection_variable = {'groups': self._active_groups,
                               'variables': self._overall,
                               'directions': self._active_directions}

    # initial state for opt variables

    # the quadratic of a smooth_atom is not included in computing
    # the smooth_objective, so its gradient is added explicitly;
    # the full-length subgradient is stored because
    # `decompose_subgradient` needs it later
    self.initial_subgrad = -(self.randomized_loss.smooth_objective(self.initial_soln, 'grad')
                             + self.randomized_loss.quadratic.objective(self.initial_soln, 'grad'))
    initial_subgrad = self.initial_subgrad[self._inactive]
    initial_unpenalized = self.initial_soln[self._unpenalized]

    self.observed_opt_state = np.concatenate([initial_scalings,
                                              initial_unpenalized,
                                              initial_subgrad], axis=0)

    # set the _solved bit

    self._solved = True

    # Now setup the pieces for linear decomposition:
    # the KKT conditions express the randomization omega as an affine
    # function of the score state and the optimization state,
    # omega = score_linear . score_state + opt_linear . opt_state + opt_offset

    (loss,
     epsilon,
     penalty,
     initial_soln,
     overall,
     inactive,
     unpenalized,
     active_groups,
     active_directions) = (self.loss,
                           self.epsilon,
                           self.penalty,
                           self.initial_soln,
                           self._overall,
                           self._inactive,
                           self._unpenalized,
                           self._active_groups,
                           self._active_directions)

    # scaling should be chosen to be Lipschitz constant
    # for gradient of Gaussian part

    # we are implicitly assuming that
    # loss is a pairs model

    self.scaling = scaling  # stored for `decompose_subgradient`
    _sqrt_scaling = np.sqrt(scaling)

    _beta_unpenalized = restricted_Mest(loss, overall, solve_args=solve_args)

    beta_full = np.zeros(overall.shape)
    beta_full[overall] = _beta_unpenalized
    _hessian = loss.hessian(beta_full)
    self._beta_full = beta_full

    # observed state for score

    self.observed_score_state = np.hstack([_beta_unpenalized * _sqrt_scaling,
                                           -loss.smooth_objective(beta_full, 'grad')[inactive] / _sqrt_scaling])

    # form linear part

    self.num_opt_var = p = loss.shape[0]  # shorthand for p

    # (\bar{\beta}_{E \cup U}, N_{-E}, c_E, \beta_U, z_{-E})
    # E for active
    # U for unpenalized
    # -E for inactive

    _opt_linear_term = np.zeros((p, self._active_groups.sum()
                                 + unpenalized.sum()
                                 + inactive.sum()))
    _score_linear_term = np.zeros((p, p))

    # \bar{\beta}_{E \cup U} piece -- the unpenalized M estimator

    Mest_slice = slice(0, overall.sum())
    _Mest_hessian = _hessian[:, overall]
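    # The first block of the score transform is minus the Hessian columns
    # of the selected (active + unpenalized) variables: the score is
    # linearized in the restricted M-estimate \bar{\beta}_{E \cup U}.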
    _score_linear_term[:, Mest_slice] = -_Mest_hessian / _sqrt_scaling

    # N_{-(E \cup U)} piece -- inactive coordinates of score of
    # M estimator at unpenalized solution

    null_idx = np.arange(overall.sum(), p)
    inactive_idx = np.nonzero(inactive)[0]
    for _i, _n in zip(inactive_idx, null_idx):
        _score_linear_term[_i, _n] = -_sqrt_scaling

    # c_E piece

    scaling_slice = slice(0, active_groups.sum())
    if len(active_directions) == 0:
        _opt_hessian = 0
    else:
        _opt_hessian = (_hessian + epsilon * np.identity(p)).dot(active_directions)
    _opt_linear_term[:, scaling_slice] = _opt_hessian / _sqrt_scaling
    self.observed_opt_state[scaling_slice] *= _sqrt_scaling

    # beta_U piece

    unpenalized_slice = slice(active_groups.sum(),
                              active_groups.sum() + unpenalized.sum())
    unpenalized_directions = np.identity(p)[:, unpenalized]
    if unpenalized.sum():
        _opt_linear_term[:, unpenalized_slice] = ((_hessian + epsilon * np.identity(p))
                                                  .dot(unpenalized_directions)
                                                  / _sqrt_scaling)
        self.observed_opt_state[unpenalized_slice] *= _sqrt_scaling

    # subgrad piece

    subgrad_idx = np.arange(active_groups.sum() + unpenalized.sum(),
                            active_groups.sum() + inactive.sum() + unpenalized.sum())
    subgrad_slice = slice(active_groups.sum() + unpenalized.sum(),
                          active_groups.sum() + inactive.sum() + unpenalized.sum())
    for _i, _s in zip(inactive_idx, subgrad_idx):
        _opt_linear_term[_i, _s] = _sqrt_scaling
    self.observed_opt_state[subgrad_slice] /= _sqrt_scaling

    # form affine part

    _opt_affine_term = np.zeros(p)
    idx = 0
    groups = np.unique(penalty.groups)
    for i, g in enumerate(groups):
        if active_groups[i]:
            group = penalty.groups == g
            _opt_affine_term[group] = active_directions[:, idx][group] * penalty.weights[g]
            idx += 1

    # two transforms that encode score and optimization
    # variable roles

    # later, we will modify `score_transform`
    # in `linear_decomposition`

    self.opt_transform = (_opt_linear_term, _opt_affine_term)
    self.score_transform = (_score_linear_term,
                            np.zeros(_score_linear_term.shape[0]))

    # now store everything needed for the projections;
    # the projection acts only on the optimization variables

    self.scaling_slice = scaling_slice
    self.unpenalized_slice = unpenalized_slice  # needed by `decompose_subgradient`

    # weights are scaled here because the linear terms scale them by scaling

    new_groups = penalty.groups[inactive]
    new_weights = {g: penalty.weights[g] / _sqrt_scaling
                   for g in penalty.weights
                   if g in np.unique(new_groups)}

    # we form a dual group lasso object
    # to do the projection

    self.group_lasso_dual = rr.group_lasso_dual(new_groups,
                                                weights=new_weights,
                                                bound=1.)
    self.subgrad_slice = subgrad_slice

    self._setup = True
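# A self-contained shape sketch (illustration only, hypothetical sizes) of
# how the two transforms built in `solve` reconstruct the randomization:
# omega = score_linear.dot(score_state) + opt_linear.dot(opt_state) + opt_offset,
# with opt_state laid out as (active scalings, unpenalized coefficients,
# inactive subgradients).

def _reconstruction_sketch():
    import numpy as np  # local import keeps the sketch self-contained
    p, n_active, n_unpen, n_inactive = 5, 2, 1, 2  # hypothetical sizes
    rng = np.random.default_rng(0)
    score_linear = rng.standard_normal((p, p))
    opt_linear = rng.standard_normal((p, n_active + n_unpen + n_inactive))
    opt_offset = rng.standard_normal(p)
    score_state = rng.standard_normal(p)
    opt_state = rng.standard_normal(n_active + n_unpen + n_inactive)
    omega = score_linear.dot(score_state) + opt_linear.dot(opt_state) + opt_offset
    assert omega.shape == (p,)
    return omega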
def decompose_subgradient(self,
                          conditioning_groups=None,
                          marginalizing_groups=None):
    """
    Reparametrize the optimization variables by either conditioning on
    or marginalizing over the subgradient of each inactive group.

    `conditioning_groups` and `marginalizing_groups` are boolean arrays
    indexed by group and should be disjoint; inactive groups in neither
    set keep their subgradient as a (moving) optimization variable.
    """

    groups = np.unique(self.penalty.groups)
    condition_inactive_groups = np.zeros_like(groups, dtype=bool)

    if conditioning_groups is None:
        conditioning_groups = np.zeros_like(groups, dtype=bool)
    if marginalizing_groups is None:
        marginalizing_groups = np.zeros_like(groups, dtype=bool)

    if np.any(conditioning_groups & marginalizing_groups):
        raise ValueError("cannot simultaneously condition and marginalize over a group's subgradient")

    if not self._setup:
        raise ValueError('setup_sampler should be called before using this function')

    condition_inactive_variables = np.zeros_like(self._inactive, dtype=bool)
    moving_inactive_groups = np.zeros_like(groups, dtype=bool)
    moving_inactive_variables = np.zeros_like(self._inactive, dtype=bool)

    # per-group mask of inactive (penalized but not selected) groups
    _inactive_groups = ~(self._active_groups | self._unpenalized_groups)

    # per-variable masks and limits for the marginalized coordinates
    inactive_marginal_groups = np.zeros_like(self._inactive, dtype=bool)
    limits_marginal_groups = np.zeros_like(self._inactive, float)

    for i, g in enumerate(groups):
        if _inactive_groups[i] and conditioning_groups[i]:
            group = self.penalty.groups == g
            condition_inactive_groups[i] = True
            condition_inactive_variables[group] = True
        elif (_inactive_groups[i]
              and (not conditioning_groups[i])
              and (not marginalizing_groups[i])):
            group = self.penalty.groups == g
            moving_inactive_groups[i] = True
            moving_inactive_variables[group] = True
        if _inactive_groups[i] and marginalizing_groups[i]:
            group = self.penalty.groups == g
            inactive_marginal_groups[group] = True
            limits_marginal_groups[group] = self.penalty.weights[g]

    opt_linear, opt_offset = self.opt_transform

    # column counts use variable (not group) counts so they agree with
    # `unpenalized_slice` as constructed in `solve`
    new_linear = np.zeros((opt_linear.shape[0],
                           (self._active_groups.sum()
                            + self._unpenalized.sum()
                            + moving_inactive_variables.sum())))
    new_linear[:, self.scaling_slice] = opt_linear[:, self.scaling_slice]
    new_linear[:, self.unpenalized_slice] = opt_linear[:, self.unpenalized_slice]

    inactive_moving_idx = np.nonzero(moving_inactive_variables)[0]
    subgrad_idx = np.arange(self._active_groups.sum() + self._unpenalized.sum(),
                            self._active_groups.sum() + self._unpenalized.sum()
                            + moving_inactive_variables.sum())
    subgrad_slice = subgrad_idx
    for _i, _s in zip(inactive_moving_idx, subgrad_idx):
        new_linear[_i, _s] = 1.

    # copy so that writing the moving subgradients below does not
    # alias into `self.observed_opt_state`
    observed_opt_state = self.observed_opt_state[:(self._active_groups.sum()
                                                   + self._unpenalized.sum()
                                                   + moving_inactive_variables.sum())].copy()
    observed_opt_state[subgrad_idx] = self.initial_subgrad[moving_inactive_variables]

    condition_linear = np.zeros((opt_linear.shape[0],
                                 (self._active_groups.sum()
                                  + self._unpenalized.sum()
                                  + condition_inactive_variables.sum())))
    inactive_condition_idx = np.nonzero(condition_inactive_variables)[0]
    subgrad_condition_idx = np.arange(self._active_groups.sum() + self._unpenalized.sum(),
                                      self._active_groups.sum() + self._unpenalized.sum()
                                      + condition_inactive_variables.sum())

    for _i, _s in zip(inactive_condition_idx, subgrad_condition_idx):
        condition_linear[_i, _s] = 1.
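    # Each inactive group now falls into exactly one of three disjoint
    # categories: conditioned (subgradient frozen at its observed value),
    # moving (subgradient stays an optimization variable), or marginalized
    # (subgradient integrated out of the randomization density over
    # [-weight, weight]). The conditioned block is folded into the affine
    # offset below.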
    new_offset = (condition_linear[:, subgrad_condition_idx]
                  .dot(self.initial_subgrad[condition_inactive_variables])
                  + opt_offset)
    new_opt_transform = (new_linear, new_offset)

    def _fraction(_cdf, _pdf, full_state_plus, full_state_minus,
                  inactive_marginal_groups):
        return (np.divide(_pdf(full_state_plus) - _pdf(full_state_minus),
                          _cdf(full_state_plus) - _cdf(full_state_minus)))[inactive_marginal_groups]

    def new_grad_log_density(query,
                             limits_marginal_groups,
                             inactive_marginal_groups,
                             _cdf,
                             _pdf,
                             opt_linear,
                             deriv_log_dens,
                             internal_state,
                             opt_state):

        full_state = reconstruct_full_from_internal(new_opt_transform,
                                                    query.score_transform,
                                                    internal_state,
                                                    opt_state)

        p = query.penalty.shape[0]
        weights = np.zeros(p)

        if inactive_marginal_groups.sum() > 0:
            full_state_plus = full_state + np.multiply(limits_marginal_groups,
                                                       np.array(inactive_marginal_groups, float))
            full_state_minus = full_state - np.multiply(limits_marginal_groups,
                                                        np.array(inactive_marginal_groups, float))
            weights[inactive_marginal_groups] = _fraction(_cdf,
                                                          _pdf,
                                                          full_state_plus,
                                                          full_state_minus,
                                                          inactive_marginal_groups)
        weights[~inactive_marginal_groups] = deriv_log_dens(full_state)[~inactive_marginal_groups]
        return -opt_linear.T.dot(weights)

    new_grad_log_density = functools.partial(new_grad_log_density,
                                             self,
                                             limits_marginal_groups,
                                             inactive_marginal_groups,
                                             self.randomization._cdf,
                                             self.randomization._pdf,
                                             new_opt_transform[0],
                                             self.randomization._derivative_log_density)

    def new_log_density(query,
                        limits_marginal_groups,
                        inactive_marginal_groups,
                        _cdf,
                        _pdf,
                        opt_linear,
                        log_dens,
                        internal_state,
                        opt_state):

        full_state = reconstruct_full_from_internal(new_opt_transform,
                                                    query.score_transform,
                                                    internal_state,
                                                    opt_state)
        full_state = np.atleast_2d(full_state)

        p = query.penalty.shape[0]
        logdens = np.zeros(full_state.shape[0])

        if inactive_marginal_groups.sum() > 0:
            full_state_plus = full_state + np.multiply(limits_marginal_groups,
                                                       np.array(inactive_marginal_groups, float))
            full_state_minus = full_state - np.multiply(limits_marginal_groups,
                                                        np.array(inactive_marginal_groups, float))
            logdens += np.sum(np.log(_cdf(full_state_plus)
                                     - _cdf(full_state_minus))[:, inactive_marginal_groups],
                              axis=1)

        logdens += log_dens(full_state[:, ~inactive_marginal_groups])

        return np.squeeze(logdens)  # should this be negative to match the gradient log density?

    # `opt_linear` is unused in the body above; the new transform is passed
    # for symmetry with the gradient version
    new_log_density = functools.partial(new_log_density,
                                        self,
                                        limits_marginal_groups,
                                        inactive_marginal_groups,
                                        self.randomization._cdf,
                                        self.randomization._pdf,
                                        new_opt_transform[0],
                                        self.randomization._log_density)

    # per-variable mask, to match `penalty.groups`
    new_groups = self.penalty.groups[moving_inactive_variables]

    _sqrt_scaling = np.sqrt(self.scaling)
    new_weights = {g: self.penalty.weights[g] / _sqrt_scaling
                   for g in self.penalty.weights
                   if g in np.unique(new_groups)}

    new_group_lasso_dual = rr.group_lasso_dual(new_groups,
                                               weights=new_weights,
                                               bound=1.)
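    # The projection below acts only on the remaining optimization
    # variables: active-group scalings are clipped to be nonnegative and
    # the moving inactive subgradients are projected onto the dual
    # group-lasso ball via `bound_prox`.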
    def new_projection(group_lasso_dual, noverall, opt_state):
        new_state = opt_state.copy()
        new_state[self.scaling_slice] = np.maximum(opt_state[self.scaling_slice], 0)
        new_state[noverall:] = group_lasso_dual.bound_prox(opt_state[noverall:])
        return new_state

    # the subgradient block starts after the active-group scalings and
    # the unpenalized coefficients
    new_projection = functools.partial(new_projection,
                                       new_group_lasso_dual,
                                       self._active_groups.sum() + self._unpenalized.sum())

    new_selection_variable = copy(self.selection_variable)
    new_selection_variable['subgradient'] = self.observed_opt_state[self.subgrad_slice]

    self.sampler = optimization_sampler(observed_opt_state,
                                        self.observed_internal_state.copy(),
                                        self.score_transform,
                                        new_opt_transform,
                                        new_projection,
                                        new_grad_log_density,
                                        new_log_density,
                                        selection_info=(self, new_selection_variable))
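# A self-contained sketch of the marginalization weight computed by
# `_fraction` above, written for a standard Gaussian randomization via
# scipy.stats.norm; the randomization object's _cdf/_pdf are assumed to
# behave like these coordinate-wise functions, and `u`, `lam` are
# hypothetical values.

def _fraction_sketch():
    import numpy as np
    from scipy.stats import norm
    u = np.array([0.3, -0.5])   # hypothetical reconstructed coordinates
    lam = np.array([1.0, 1.0])  # hypothetical group weights (limits)
    # d/du log integral_{u-lam}^{u+lam} phi(t) dt
    #   = (phi(u+lam) - phi(u-lam)) / (Phi(u+lam) - Phi(u-lam))
    return ((norm.pdf(u + lam) - norm.pdf(u - lam))
            / (norm.cdf(u + lam) - norm.cdf(u - lam)))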
def setup_sampler(self, solve_args={'min_its': 50, 'tol': 1.e-10}):
    """
    Should return a bootstrap_score
    """

    (loss,
     epsilon,
     penalty,
     randomization,
     initial_soln,
     overall,
     inactive,
     unpenalized,
     active_groups,
     active_directions) = (self.loss,
                           self.epsilon,
                           self.penalty,
                           self.randomization,
                           self.initial_soln,
                           self._overall,
                           self._inactive,
                           self._unpenalized,
                           self._active_groups,
                           self._active_directions)

    # we are implicitly assuming that
    # loss is a pairs model

    _beta_unpenalized = restricted_Mest(loss, overall, solve_args=solve_args)

    beta_full = np.zeros(overall.shape)
    beta_full[overall] = _beta_unpenalized
    _hessian = loss.hessian(beta_full)
    self._beta_full = beta_full

    # observed state for score

    self.observed_score_state = np.hstack([_beta_unpenalized,
                                           -loss.smooth_objective(beta_full, 'grad')[inactive]])

    # form linear part

    self.num_opt_var = p = loss.shape[0]  # shorthand for p

    # (\bar{\beta}_{E \cup U}, N_{-E}, c_E, \beta_U, z_{-E})
    # E for active
    # U for unpenalized
    # -E for inactive

    _opt_linear_term = np.zeros((p, active_groups.sum()
                                 + unpenalized.sum()
                                 + inactive.sum()))
    _score_linear_term = np.zeros((p, p))

    # \bar{\beta}_{E \cup U} piece -- the unpenalized M estimator

    Mest_slice = slice(0, overall.sum())
    _Mest_hessian = _hessian[:, overall]
    _score_linear_term[:, Mest_slice] = -_Mest_hessian

    # N_{-(E \cup U)} piece -- inactive coordinates of score of
    # M estimator at unpenalized solution

    null_idx = np.arange(overall.sum(), p)
    inactive_idx = np.nonzero(inactive)[0]
    for _i, _n in zip(inactive_idx, null_idx):
        _score_linear_term[_i, _n] = -1.

    # c_E piece

    scaling_slice = slice(0, active_groups.sum())
    _opt_hessian = (_hessian + epsilon * np.identity(p)).dot(active_directions)
    _opt_linear_term[:, scaling_slice] = _opt_hessian

    # beta_U piece

    unpenalized_slice = slice(active_groups.sum(),
                              active_groups.sum() + unpenalized.sum())
    unpenalized_directions = np.identity(p)[:, unpenalized]
    if unpenalized.sum():
        _opt_linear_term[:, unpenalized_slice] = ((_hessian + epsilon * np.identity(p))
                                                  .dot(unpenalized_directions))

    # subgrad piece

    subgrad_idx = np.arange(active_groups.sum() + unpenalized.sum(),
                            active_groups.sum() + inactive.sum() + unpenalized.sum())
    subgrad_slice = slice(active_groups.sum() + unpenalized.sum(),
                          active_groups.sum() + inactive.sum() + unpenalized.sum())
    for _i, _s in zip(inactive_idx, subgrad_idx):
        _opt_linear_term[_i, _s] = 1.

    # form affine part

    _opt_affine_term = np.zeros(p)
    idx = 0
    groups = np.unique(penalty.groups)
    for i, g in enumerate(groups):
        if active_groups[i]:
            group = penalty.groups == g
            _opt_affine_term[group] = active_directions[:, idx][group] * penalty.weights[g]
            idx += 1

    # two transforms that encode score and optimization
    # variable roles

    # later, conditioning will modify `score_transform`

    self.opt_transform = (_opt_linear_term, _opt_affine_term)
    self.score_transform = (_score_linear_term,
                            np.zeros(_score_linear_term.shape[0]))

    # now store everything needed for the projections;
    # the projection acts only on the optimization variables

    self.scaling_slice = scaling_slice

    new_groups = penalty.groups[inactive]
    new_weights = {g: penalty.weights[g] for g in penalty.weights
                   if g in np.unique(new_groups)}

    # we form a dual group lasso object
    # to do the projection

    self.group_lasso_dual = rr.group_lasso_dual(new_groups,
                                                weights=new_weights,
                                                bound=1.)
    self.subgrad_slice = subgrad_slice
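# A self-contained sketch (illustration, not regreg's implementation) of
# the projection the dual group-lasso object performs: each group block of
# the subgradient is shrunk onto the ball of radius equal to its weight.

def _dual_projection_sketch(z, groups, weights):
    import numpy as np  # local import keeps the sketch self-contained
    z = np.asarray(z, float).copy()
    groups = np.asarray(groups)
    for g in np.unique(groups):
        block = groups == g
        norm_g = np.linalg.norm(z[block])
        if norm_g > weights[g]:
            z[block] *= weights[g] / norm_g  # project block onto its ball
    return z

# e.g. _dual_projection_sketch([3., 4.], [0, 0], {0: 1.}) -> [0.6, 0.8]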