def test_risk_slim(data_csv_file, sample_weights_csv_file=None, max_coefficient=5, max_L0_value=5,
                   max_offset=50, c0_value=1e-6, w_pos=1.00, settings=None):

    # load dataset
    data = load_data_from_csv(dataset_csv_file=data_csv_file, sample_weights_csv_file=sample_weights_csv_file)
    N, P = data['X'].shape

    # offset value
    coef_set = CoefficientSet(variable_names=data['variable_names'], lb=-max_coefficient, ub=max_coefficient, sign=0)
    conservative_offset = get_conservative_offset(data, coef_set, max_L0_value)
    max_offset = min(max_offset, conservative_offset)
    coef_set['(Intercept)'].ub = max_offset
    coef_set['(Intercept)'].lb = -max_offset

    # create constraint dictionary
    trivial_L0_max = P - np.sum(coef_set.C_0j == 0)
    max_L0_value = min(max_L0_value, trivial_L0_max)

    constraints = {
        'L0_min': 0,
        'L0_max': max_L0_value,
        'coef_set': coef_set,
    }

    # train model using lattice_cpa
    model_info, mip_info, lcpa_info = run_lattice_cpa(data, constraints, settings)

    # model_info contains key results
    pprint(model_info)

    # lcpa_info contains detailed information about the LCPA run
    pprint(lcpa_info)

    # todo: check solution
    # mip_info contains handles to access the MIP
    mip_info['risk_slim_mip']   # CPLEX mip
    mip_info['risk_slim_idx']   # indices of the relevant constraints

    return True
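
# Minimal usage sketch for test_risk_slim() above. The csv path and the
# settings values are illustrative assumptions; the settings keys mirror
# those used in risk_slim() further below.
example_settings = {
    'c0_value': 1e-6,                  # L0 penalty (> 0); small values favor models with max_L0_value terms
    'max_runtime': 30.0,               # stop LCPA after 30 seconds
    'display_cplex_progress': False,   # keep CPLEX quiet
}
test_risk_slim(data_csv_file='examples/data/breastcancer_data.csv',   # hypothetical path
               max_coefficient=5,
               max_L0_value=5,
               settings=example_settings)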
# NOTE: data_name (the name of the dataset) is assumed to be defined earlier in the script
data_dir = os.getcwd() + '/examples/data/'      # directory where datasets are stored
data_csv_file = data_dir + data_name + '_data.csv'   # csv file for the dataset
sample_weights_csv_file = None                  # csv file of sample weights for the dataset (optional)

# problem parameters
max_coefficient = 5     # value of largest/smallest coefficient
max_L0_value = 5        # maximum model size
max_offset = 50         # maximum value of offset parameter (optional)
c0_value = 1e-6         # L0-penalty parameter such that c0_value > 0; larger values -> sparser models;
                        # set to a small value (1e-6) so that we get a model with max_L0_value terms
w_pos = 1.00            # relative weight on examples with y = +1; w_neg = 1.00 (optional)

# load dataset
data = load_data_from_csv(dataset_csv_file=data_csv_file, sample_weights_csv_file=sample_weights_csv_file)
N, P = data['X'].shape

# coefficient set
coef_set = CoefficientSet(variable_names=data['variable_names'], lb=-max_coefficient, ub=max_coefficient, sign=0)

# offset value
conservative_offset = get_conservative_offset(data, coef_set, max_L0_value)
max_offset = min(max_offset, conservative_offset)
coef_set['(Intercept)'].ub = max_offset
coef_set['(Intercept)'].lb = -max_offset

# create constraint dictionary
trivial_L0_max = P - np.sum(coef_set.C_0j == 0)
max_L0_value = min(max_L0_value, trivial_L0_max)

constraints = {
    'L0_min': 0,
    'L0_max': max_L0_value,
    'coef_set': coef_set,
}
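
# Sketch of how this script would typically continue: assemble LCPA settings
# and solve with run_lattice_cpa(), mirroring test_risk_slim() and risk_slim()
# elsewhere in this file. The settings values here are illustrative assumptions.
settings = {
    'c0_value': c0_value,
    'w_pos': w_pos,
    'max_runtime': 300.0,              # max runtime for LCPA
    'display_cplex_progress': True,    # print CPLEX progress on screen
}

model_info, mip_info, lcpa_info = run_lattice_cpa(data, constraints, settings)
print_model(model_info['solution'], data)   # print the learned risk score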
def risk_slim(data, max_coefficient, max_L0_value, c0_value, max_offset, max_runtime=120, w_pos=1):
    """
    @parameters:

    max_coefficient:  value of largest/smallest coefficient
    max_L0_value:     maximum model size (set as float('inf') for no limit)
    max_offset:       maximum value of offset parameter (optional)
    c0_value:         L0-penalty parameter such that c0_value > 0; larger values -> sparser models;
                      we set it to a small value (1e-6) so that we get a model with max_L0_value terms
    max_runtime:      max algorithm running time
    w_pos:            relative weight on examples with y = +1; w_neg = 1.00 (optional)
    """

    # create coefficient set and set the value of the offset parameter
    coef_set = CoefficientSet(variable_names=data['variable_names'], lb=-max_coefficient, ub=max_coefficient, sign=0)
    conservative_offset = get_conservative_offset(data, coef_set, max_L0_value)
    max_offset = min(max_offset, conservative_offset)
    coef_set['(Intercept)'].ub = max_offset
    coef_set['(Intercept)'].lb = -max_offset

    constraints = {
        'L0_min': 0,
        'L0_max': max_L0_value,
        'coef_set': coef_set,
    }

    # set parameters
    settings = {
        # Problem Parameters
        'c0_value': c0_value,
        'w_pos': w_pos,

        # LCPA Settings
        'max_runtime': max_runtime,                 # max runtime for LCPA
        'max_tolerance': np.finfo('float').eps,     # tolerance to stop LCPA (set to 0 to return a provably optimal solution)
        'display_cplex_progress': True,             # print CPLEX progress on screen
        'loss_computation': 'lookup',               # how to compute the loss function ('normal', 'fast', 'lookup')

        # LCPA Improvements
        'round_flag': False,                        # round continuous solutions with SeqRd
        'polish_flag': False,                       # polish integer feasible solutions with DCD
        'chained_updates_flag': False,              # use chained updates
        'add_cuts_at_heuristic_solutions': True,    # add cuts at integer feasible solutions found using polishing/rounding

        # Initialization
        'initialization_flag': True,                # use initialization procedure
        'init_max_runtime': 300.0,                  # max time to run CPA in initialization procedure
        'init_max_coefficient_gap': 0.49,

        # CPLEX Solver Parameters
        'cplex_randomseed': 0,                      # random seed
        'cplex_mipemphasis': 0,                     # cplex MIP strategy
    }

    # train model using lattice_cpa
    model_info, mip_info, lcpa_info = run_lattice_cpa(data, constraints, settings)

    return model_info, mip_info, lcpa_info
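
# Example call to risk_slim() above. The `data` dictionary is assumed to come
# from load_data_from_csv(), as in the other snippets in this file; the
# argument values are illustrative.
model_info, mip_info, lcpa_info = risk_slim(data,
                                            max_coefficient=5,
                                            max_L0_value=5,
                                            c0_value=1e-6,
                                            max_offset=50,
                                            max_runtime=60,
                                            w_pos=1.00)
print_model(model_info['solution'], data)   # print the learned risk score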
def fit(self, X, y):
    X, y = check_X_y(X, y, accept_sparse=True)
    self.is_fitted_ = True

    # transforming data
    raw_data = np.insert(X, 0, y, axis=1)
    N = raw_data.shape[0]

    # setup Y vector and Y_name
    Y_col_idx = [0]
    Y = raw_data[:, Y_col_idx]
    Y_name = self.data_headers[Y_col_idx[0]]
    Y[Y == 0] = -1

    # setup X and X_names
    X_col_idx = [j for j in range(raw_data.shape[1]) if j not in Y_col_idx]
    X = raw_data[:, X_col_idx]
    variable_names = [self.data_headers[j] for j in X_col_idx]

    # insert a column of ones to X for the intercept
    X = np.insert(arr=X, obj=0, values=np.ones(N), axis=1)
    variable_names.insert(0, '(Intercept)')

    if self.sample_weights is None or len(self.sample_weights) != N:
        self.sample_weights = np.ones(N)

    self.data = {
        'X': X,
        'Y': Y,
        'variable_names': variable_names,
        'outcome_name': Y_name,
        'sample_weights': self.sample_weights,
    }

    # load folds
    if self.fold_csv_file is not None:
        if not os.path.isfile(self.fold_csv_file):
            raise IOError('could not find fold_csv_file: %s' % self.fold_csv_file)
        else:
            fold_idx = pd.read_csv(self.fold_csv_file, sep=',', header=None)
            fold_idx = fold_idx.values.flatten()
            K = max(fold_idx)
            all_fold_nums = np.sort(np.unique(fold_idx))
            assert len(fold_idx) == N, \
                "dimension mismatch: read %r fold indices (expected N = %r)" % (len(fold_idx), N)
            assert np.all(all_fold_nums == np.arange(1, K + 1)), \
                "folds should contain indices between 1 to %r" % K
            assert self.fold_num in np.arange(0, K + 1), \
                "fold_num should either be 0 or an integer between 1 to %r" % K
            if self.fold_num >= 1:
                test_idx = self.fold_num == fold_idx
                train_idx = self.fold_num != fold_idx
                self.data['X'] = self.data['X'][train_idx, ]
                self.data['Y'] = self.data['Y'][train_idx]
                self.data['sample_weights'] = self.data['sample_weights'][train_idx]

    assert check_data(self.data)

    # create coefficient set and set the value of the offset parameter
    coef_set = CoefficientSet(variable_names=self.data['variable_names'],
                              lb=-self.max_coefficient,
                              ub=self.max_coefficient,
                              sign=0)
    conservative_offset = get_conservative_offset(self.data, coef_set, self.max_L0_value)
    self.max_offset = min(self.max_offset, conservative_offset)
    coef_set['(Intercept)'].ub = self.max_offset
    coef_set['(Intercept)'].lb = -self.max_offset

    # edit constraints here
    constraints = {
        'L0_min': 0,
        'L0_max': self.max_L0_value,
        'coef_set': coef_set,
    }

    # initialize MIP for lattice CPA
    mip_objects = setup_lattice_cpa(self.data, constraints, self.settings)

    # add operational constraints
    mip = mip_objects['mip']
    indices = mip_objects['indices']
    get_alpha_name = lambda var_name: 'alpha_' + str(self.data['variable_names'].index(var_name))
    get_alpha_ind = lambda var_names: [get_alpha_name(v) for v in var_names]

    # apply mutual exclusivity constraints on features
    if self.op_constraints is not None:
        names = []
        expressions = []
        for key in self.op_constraints.keys():
            names.append("mutually_exclusive_%s" % key)
            expressions.append(cplex.SparsePair(ind=get_alpha_ind(self.op_constraints[key]),
                                                val=[1.0] * len(self.op_constraints[key])))
        mip.linear_constraints.add(names=names,
                                   lin_expr=expressions,
                                   senses=["L"] * len(self.op_constraints.keys()),
                                   rhs=[1.0] * len(self.op_constraints.keys()))
    mip_objects['mip'] = mip

    # fit using lattice CPA
    model_info, mip_info, lcpa_info = finish_lattice_cpa(self.data, constraints, mip_objects, self.settings)
    rho = model_info['solution']
    self.model_info = model_info

    if np.sum(rho[1:]) != 0:
        print_model(model_info['solution'], self.data)
    print("solver_time = %d" % model_info['solver_time'])
    print("optimality_gap = %.3f" % model_info['optimality_gap'])
    print(rho)

    variable_names = self.data['variable_names']
    rho_values = np.copy(rho)
    rho_names = list(variable_names)

    # remove the intercept value or set it to 0
    if '(Intercept)' in rho_names:
        intercept_ind = variable_names.index('(Intercept)')
        self.intercept_val = int(rho[intercept_ind])
        rho_values = np.delete(rho_values, intercept_ind)
        rho_names.remove('(Intercept)')
    else:
        self.intercept_val = 0

    self.filter_mask = np.array(rho_values) != 0

    # remove zero values
    if not self.show_omitted_variables:
        selected_ind = np.flatnonzero(rho_values)
        self.rho_values = rho_values[selected_ind]
        self.rho_names = [rho_names[i] for i in selected_ind]

    return self
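
# Hypothetical companion to fit(), not part of the original estimator: a sketch
# of how the stored attributes could be used to score new examples. It assumes
# show_omitted_variables is False (so self.rho_values holds only the nonzero
# coefficients) and that the columns of X follow the same feature order used
# during fit.
def predict_proba_sketch(self, X):
    X = np.asarray(X, dtype=float)
    # keep only the features with nonzero coefficients, add the intercept
    scores = self.intercept_val + X[:, self.filter_mask].dot(self.rho_values)
    return 1.0 / (1.0 + np.exp(-scores))   # risk estimates via the logistic link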
logger.info("loading data and sample weights") data = load_data_from_csv(dataset_csv_file=parsed.data, sample_weights_csv_file=parsed.weights, fold_csv_file=parsed.cvindices, fold_num=parsed.fold) N, P = data['X'].shape # initialize coefficient set and offset parameter logger.info("creating coefficient set and constraints") max_coefficient = parsed.max_coef max_model_size = parsed.max_size if parsed.max_size >= 0 else float('inf') max_offset = parsed.max_offset if parsed.max_offset >= 0 else float('inf') coef_set = CoefficientSet(variable_names=data['variable_names'], lb=-max_coefficient, ub=max_coefficient, sign=0) coef_set.update_intercept_bounds(X=data['X'], y=data['y'], max_offset=max_offset, max_L0_value=max_model_size) #print coefficient set if not parsed.silent: print(coef_set) constraints = { 'L0_min': 0, 'L0_max': max_model_size, 'coef_set': coef_set, }