def __init__(self, layers):
    '''
    layers: A list of PyTorch layers containing only Linear/ReLU/MaxPools
    '''
    self.layers = layers
    self.net = nn.Sequential(*layers)

    # Initialize a LinearizedNetwork object to determine the lower and
    # upper bounds at each layer.
    self.lin_net = LinearizedNetwork(layers)
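
# Usage sketch for the constructor above (illustrative only: `Network` is a
# hypothetical stand-in for the enclosing class, which is not shown here, and
# the layer sizes are arbitrary):
#
#     layers = [nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 10)]
#     net = Network(layers)
#     # net.lin_net can then compute per-layer lower/upper bounds on a domain.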
def bab(gt_prop, verif_layers, domain, return_dict, timeout, batch_size,
        method, tot_iter, parent_init, args, gurobi_dict=None, writer=None):
    # NOTE: `gpu` and `decision_bound` are assumed to be module-level globals.
    epsilon = 1e-4

    if gpu:
        cuda_verif_layers = [copy.deepcopy(lay).cuda() for lay in verif_layers]
        domain = domain.cuda()
    else:
        cuda_verif_layers = [copy.deepcopy(lay) for lay in verif_layers]

    # Use the best of naive interval propagation and KW as intermediate bounds.
    intermediate_net = SaddleLP(cuda_verif_layers, store_bounds_primal=False,
                                max_batch=args.max_solver_batch)
    intermediate_net.set_solution_optimizer('best_naive_kw', None)

    anderson_bounds_net = None
    hard_crit = None
    prob_hard_crit = None
    # A smaller batch size might be needed for hard domains.
    hard_batch_size = batch_size if args.hard_batch_size == -1 else args.hard_batch_size

    # Split domains into easy and hard ones; define two separate bounding
    # methods to handle their last layer.
    if method in ["cut", "gurobi-anderson"]:
        # Criteria for flagging subproblems as hard.
        hard_crit = {
            "lb_threshold": 0.5,
            "depth_threshold": 0,  # 15
            "impr_threshold": 1e-1,
            "doms_len_threshold": 200,
            "auto": args.auto_strat,
            "hard_overhead": args.hard_overhead,  # assumed at full batch
        }

        # Set the bounding network for easy domains.
        if method == "cut":
            bigm_adam_params = {
                "bigm_algorithm": "adam",
                "bigm": "only",
                "nb_outer_iter": int(tot_iter),  # cifar_oval: 180
                'initial_step_size': args.dualinit_init_step,  # cifar_oval: 1e-2
                'initial_step_size_pinit': args.dualinit_init_step / 10,
                'final_step_size': args.dualinit_fin_step,  # cifar_oval: 1e-4
                'betas': (0.9, 0.999)
            }
            bounds_net = ExpLP(cuda_verif_layers, params=bigm_adam_params,
                               store_bounds_primal=True)
        else:
            bounds_net = LinearizedNetwork(verif_layers)

        # Set the bounding network for hard domains.
        if method == "cut":
            anderson_iter = args.hard_iter  # 100
            explp_params = {
                "nb_iter": anderson_iter,
                'bigm': "init",
                'cut': "only",
                "bigm_algorithm": "adam",
                'cut_frequency': 450,
                'max_cuts': 8,
                'cut_add': args.cut_add,  # 2
                'betas': (0.9, 0.999),
                'initial_step_size': args.init_step,
                'final_step_size': args.fin_step,
                "init_params": {
                    "nb_outer_iter": 500,  # 500 for our datasets, 1000 for cifar10_8_255
                    'initial_step_size': args.dualinit_init_step,
                    'initial_step_size_pinit': args.dualinit_init_step / 10,
                    'final_step_size': args.dualinit_fin_step,
                    'betas': (0.9, 0.999),
                },
            }
            anderson_bounds_net = ExpLP(cuda_verif_layers, params=explp_params,
                                        fixed_M=True, store_bounds_primal=True)
            print(f"Running cut for {anderson_iter} iterations")
        elif method == "gurobi-anderson":
            anderson_bounds_net = AndersonLinearizedNetwork(
                verif_layers, mode="lp-cut", n_cuts=args.n_cuts,
                cuts_per_neuron=True, decision_boundary=decision_bound)

        if args.no_easy:
            # Skip the easy-problem bounding and use the hard method for
            # everything: only a single last-layer bounding method is used.
            bounds_net = anderson_bounds_net
            anderson_bounds_net = None
    elif method == "prox":
        bounds_net = SaddleLP(cuda_verif_layers, store_bounds_primal=True,
                              max_batch=args.max_solver_batch)
        bounds_net.set_decomposition('pairs', 'KW')
        optprox_params = {
            'nb_total_steps': int(tot_iter),
            'max_nb_inner_steps': 2,  # this is 2/5 as simpleprox
            'initial_eta': args.eta,
            'final_eta': args.feta,
            'log_values': False,
            'maintain_primal': True
        }
        bounds_net.set_solution_optimizer('optimized_prox', optprox_params)
        print(f"Running prox with {tot_iter} steps")
    elif method == "adam":
        bounds_net = SaddleLP(cuda_verif_layers, store_bounds_primal=True,
                              max_batch=args.max_solver_batch)
        bounds_net.set_decomposition('pairs', 'KW')
        adam_params = {
            'nb_steps': int(tot_iter),
            'initial_step_size': args.init_step,
            'final_step_size': args.fin_step,
            'betas': (0.9, 0.999),
            'log_values': False
        }
        bounds_net.set_solution_optimizer('adam', adam_params)
        print(f"Running adam with {tot_iter} steps")
    elif method == "bigm-adam":
        bigm_adam_params = {
            "bigm_algorithm": "adam",
            "bigm": "only",
            "nb_outer_iter": int(tot_iter),
            'initial_step_size': args.init_step,
            'initial_step_size_pinit': args.init_step / 10,
            'final_step_size': args.fin_step,
            'betas': (0.9, 0.999)
        }
        bounds_net = ExpLP(cuda_verif_layers, params=bigm_adam_params,
                           store_bounds_primal=True)
    elif method == "gurobi":
        bounds_net = LinearizedNetwork(verif_layers)

    # Branching strategy.
    if args.branching_choice == 'heuristic':
        branching_net_name = None
    else:
        raise NotImplementedError

    with torch.no_grad():
        min_lb, min_ub, ub_point, nb_states, fail_safe_ratio = relu_bab(
            intermediate_net, bounds_net, branching_net_name, domain,
            decision_bound, eps=epsilon, timeout=timeout,
            batch_size=batch_size, parent_init_flag=parent_init,
            gurobi_specs=gurobi_dict, anderson_bounds_net=anderson_bounds_net,
            writer=writer, hard_crit=hard_crit,
            hard_batch_size=hard_batch_size)

    if min_lb is None or min_ub is None or ub_point is None:
        # Timeout: relu_bab returned without bounds. Explicit `is None` checks
        # avoid ambiguous tensor truthiness (`not (min_lb or ...)` can raise
        # on multi-element tensors).
        return_dict["min_lb"] = None
        return_dict["min_ub"] = None
        return_dict["ub_point"] = None
        return_dict["nb_states"] = nb_states
        return_dict["bab_out"] = "timeout"
        return_dict["fs_ratio"] = fail_safe_ratio
    else:
        return_dict["min_lb"] = min_lb.cpu()
        return_dict["min_ub"] = min_ub.cpu()
        return_dict["ub_point"] = ub_point.cpu()
        return_dict["nb_states"] = nb_states
        return_dict["fs_ratio"] = fail_safe_ratio
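
# Usage sketch for `bab` (illustrative only: `gt_prop`, `verif_layers`,
# `domain` and the parsed `args` namespace come from the surrounding
# verification pipeline, and the hyperparameter values below are arbitrary):
#
#     from multiprocessing import Manager
#     return_dict = Manager().dict()
#     bab(gt_prop, verif_layers, domain, return_dict, timeout=3600,
#         batch_size=150, method="prox", tot_iter=100, parent_init=True,
#         args=args)
#     if return_dict.get("bab_out") == "timeout":
#         print("Property not decided within the time limit.")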
def reluify_maxpool(layers, domain, no_opt=False):
    '''
    Remove all the MaxPool units of a feedforward network represented by
    `layers` and replace them by an equivalent combination of ReLU + Linear.

    This is only valid over the domain `domain`, because we use some knowledge
    about upper and lower bounds of certain neurons.

    Args:
      no_opt: Boolean. If set to True, don't optimize the bounds used to
        convert the maxpool into ReLU; use interval analysis instead.
        If set to False, use the tight optimized bounds.
    '''
    if no_opt:
        # We're building a MIPNetwork but we are not going to solve it; this is
        # just the class that has the code for interval analysis.
        # TODO: Importing here is ugly, but avoiding it by importing at the top
        # level would mean a larger refactoring that I'm not willing to do
        # right now.
        from plnn.mip_solver import MIPNetwork
        mip_net = MIPNetwork(layers)
        mip_net.do_interval_analysis(domain)
        lbs = mip_net.lower_bounds
    else:
        # We will need some lower bounds for the inputs to the maxpooling.
        # We will simply use those given by a LinearizedNetwork.
        lin_net = LinearizedNetwork(layers)
        lin_net.define_linear_approximation(domain)
        lbs = lin_net.lower_bounds

    layers = layers[:]

    new_all_layers = []

    idx_of_inp_lbs = 0
    layer_idx = 0
    while layer_idx < len(layers):
        layer = layers[layer_idx]
        if type(layer) is nn.MaxPool1d:
            # We need to decompose this MaxPool until it only has a size of 2.
            assert layer.padding == 0
            assert layer.dilation == 1
            if layer.kernel_size > 2:
                assert layer.kernel_size % 2 == 0, "Not supported yet"
                assert layer.stride % 2 == 0, "Not supported yet"
                # We're going to decompose this maxpooling into two maxpoolings:
                #     max(in_1, in_2, in_3, in_4)
                # will become
                #     max(max(in_1, in_2), max(in_3, in_4))
                first_mp = nn.MaxPool1d(2, stride=2)
                second_mp = nn.MaxPool1d(layer.kernel_size // 2,
                                         stride=layer.stride // 2)
                # We will replace the MaxPool that was originally there with
                # those two layers, so we need to add a corresponding layer
                # of lower bounds.
                first_lbs = lbs[idx_of_inp_lbs]
                intermediate_lbs = []
                for pair_idx in range(len(first_lbs) // 2):
                    intermediate_lbs.append(max(first_lbs[2 * pair_idx],
                                                first_lbs[2 * pair_idx + 1]))
                # Do the replacement.
                del layers[layer_idx]
                layers.insert(layer_idx, first_mp)
                layers.insert(layer_idx + 1, second_mp)
                lbs.insert(idx_of_inp_lbs + 1, intermediate_lbs)

                # Now continue, so that we go through the loop again with the
                # simplified maxpool.
                continue
            elif layer.kernel_size == 2:
                # Each pair needs two neurons in the intermediate layer that
                # is going to be ReLU-ified.
                pre_nb_inp_lin = len(lbs[idx_of_inp_lbs])
                # How many starting positions can we fit in?
                # 1 + the number of strides that fit before we run past the
                # end of the array with a full kernel.
                pre_nb_out_lin = (1 + ((pre_nb_inp_lin - layer.kernel_size)
                                       // layer.stride)) * 2
                pre_relu_lin = nn.Linear(pre_nb_inp_lin, pre_nb_out_lin, bias=True)
                pre_relu_weight = pre_relu_lin.weight.data
                pre_relu_bias = pre_relu_lin.bias.data
                pre_relu_weight.zero_()
                pre_relu_bias.zero_()
                # For each pair (x, y) that needs to be transformed into
                # max(x, y), we create (x - y, y - y_lb).
                first_in_index = 0
                first_out_index = 0
                while first_in_index + 1 < pre_nb_inp_lin:
                    pre_relu_weight[first_out_index, first_in_index] = 1
                    pre_relu_weight[first_out_index, first_in_index + 1] = -1
                    pre_relu_weight[first_out_index + 1, first_in_index + 1] = 1
                    pre_relu_bias[first_out_index + 1] = \
                        -lbs[idx_of_inp_lbs][first_in_index + 1]
                    # Now shift.
                    first_in_index += layer.stride
                    first_out_index += 2
                new_all_layers.append(pre_relu_lin)
                new_all_layers.append(nn.ReLU())

                # We now need to create the second layer. It will sum
                # max(x - y, 0), max(y - y_lb, 0), and y_lb.
                post_nb_inp_lin = pre_nb_out_lin
                post_nb_out_lin = post_nb_inp_lin // 2
                post_relu_lin = nn.Linear(post_nb_inp_lin, post_nb_out_lin)
                post_relu_weight = post_relu_lin.weight.data
                post_relu_bias = post_relu_lin.bias.data
                post_relu_weight.zero_()
                post_relu_bias.zero_()
                first_in_index = 0
                out_index = 0
                while first_in_index + 1 < post_nb_inp_lin:
                    post_relu_weight[out_index, first_in_index] = 1
                    post_relu_weight[out_index, first_in_index + 1] = 1
                    post_relu_bias[out_index] = \
                        lbs[idx_of_inp_lbs][layer.stride * out_index + 1]
                    first_in_index += 2
                    out_index += 1
                new_all_layers.append(post_relu_lin)
                idx_of_inp_lbs += 1
            else:
                # This should have been cleaned up in one of the simplify passes.
                raise NotImplementedError
        elif type(layer) in [nn.Linear, nn.ReLU]:
            new_all_layers.append(layer)
            idx_of_inp_lbs += 1
        elif type(layer) is View:
            # We shouldn't add the View layers, as we are getting rid of them.
            pass
        layer_idx += 1
    return new_all_layers
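
# The ReLU-ification above relies on the identity (for scalar inputs x, y and
# any valid lower bound y_lb <= y):
#
#     max(x, y) = max(x - y, 0) + max(y - y_lb, 0) + y_lb
#
# If x >= y, the right-hand side is (x - y) + (y - y_lb) + y_lb = x;
# otherwise it is 0 + (y - y_lb) + y_lb = y. The helper below is a minimal
# illustrative check of this identity; it is not part of the original module.
def _check_maxpool_relu_identity(trials=1000):
    import random
    for _ in range(trials):
        x, y = random.uniform(-5, 5), random.uniform(-5, 5)
        y_lb = y - random.uniform(0, 5)  # any valid lower bound on y
        lhs = max(x, y)
        rhs = max(x - y, 0) + max(y - y_lb, 0) + y_lb
        assert abs(lhs - rhs) < 1e-9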
def main():
    parser = argparse.ArgumentParser(
        description="Compute and time a bunch of bounds.")
    parser.add_argument('eps', type=float,
                        help='Epsilon of the perturbation (e.g. 0.1)')
    parser.add_argument('target_directory', type=str,
                        help='Where to store the results')
    parser.add_argument('--modulo', type=int,
                        help='Number of jobs to split the dataset over.')
    parser.add_argument('--modulo_do', type=int,
                        help='Which job_id this one is.')
    parser.add_argument('--from_intermediate_bounds', action='store_true',
                        help="if this flag is set, intermediate bounds are "
                             "computed with best of naive-KW")
    parser.add_argument('--network', type=str, help='which network to use',
                        default="wide", choices=["wide", "deep"])
    args = parser.parse_args()

    results_dir = args.target_directory
    os.makedirs(results_dir, exist_ok=True)

    testset_size = int(1e5)
    for idx in range(testset_size):
        if (args.modulo is not None) and (idx % args.modulo != args.modulo_do):
            continue
        target_dir = os.path.join(results_dir, f"{idx}")
        os.makedirs(target_dir, exist_ok=True)

        X, y, elided_models = load_mnist_wide_net(idx, mnist_test=None)
        if X is None:
            continue
        elided_model = elided_models[y]
        to_ignore = y

        domain = torch.stack([
            torch.clamp(X.squeeze(0) - args.eps, 0, None),
            torch.clamp(X.squeeze(0) + args.eps, None, 1.0)
        ], -1).unsqueeze(0)

        lin_approx_string = "" if not args.from_intermediate_bounds else "-fromintermediate"

        # Compute intermediate bounds with best of naive-KW. Use only these
        # for every method, so that the comparison is on the last layer's
        # bounds, and optimize only the last layer.
        if args.from_intermediate_bounds:
            cuda_elided_model = copy.deepcopy(elided_model).cuda()
            cuda_domain = domain.cuda()
            intermediate_net = SaddleLP([lay for lay in cuda_elided_model])
            with torch.no_grad():
                intermediate_net.set_solution_optimizer('best_naive_kw', None)
                intermediate_net.define_linear_approximation(
                    cuda_domain, no_conv=False, override_numerical_errors=True)
            intermediate_ubs = intermediate_net.upper_bounds
            intermediate_lbs = intermediate_net.lower_bounds

        ## Proximal methods
        for optprox_steps in [400]:
            optprox_params = {
                'nb_total_steps': optprox_steps,
                'max_nb_inner_steps': 2,  # this is 2/5 as simpleprox
                'initial_eta': 1e0,
                'final_eta': 5e1,
                'log_values': False,
                'inner_cutoff': 0,
                'maintain_primal': True,
                'acceleration_dict': {
                    'momentum': 0.3,  # decent momentum: 0.9 w/ increasing eta
                }
            }
            optprox_target_file = os.path.join(
                target_dir,
                f"Proximal_finalmomentum_{optprox_steps}{lin_approx_string}.txt")
            if not os.path.exists(optprox_target_file):
                cuda_elided_model = copy.deepcopy(elided_model).cuda()
                cuda_domain = domain.cuda()
                optprox_net = SaddleLP([lay for lay in cuda_elided_model])
                optprox_start = time.time()
                with torch.no_grad():
                    optprox_net.set_decomposition('pairs', 'KW')
                    optprox_net.set_solution_optimizer('optimized_prox',
                                                       optprox_params)
                    if not args.from_intermediate_bounds:
                        optprox_net.define_linear_approximation(cuda_domain,
                                                                no_conv=False)
                        ub = optprox_net.upper_bounds[-1]
                    else:
                        optprox_net.build_model_using_bounds(
                            cuda_domain, (intermediate_lbs, intermediate_ubs))
                        _, ub = optprox_net.compute_lower_bound()
                optprox_end = time.time()
                optprox_time = optprox_end - optprox_start
                optprox_ubs = ub.cpu()

                del optprox_net
                dump_bounds(optprox_target_file, optprox_time, optprox_ubs)

        ## Gurobi PLANET Bounds
        grb_target_file = os.path.join(target_dir,
                                       f"Gurobi{lin_approx_string}-fixed.txt")
        if not os.path.exists(grb_target_file):
            grb_net = LinearizedNetwork([lay for lay in elided_model])
            grb_start = time.time()
            if not args.from_intermediate_bounds:
                grb_net.define_linear_approximation(domain[0], n_threads=4)
                ub = grb_net.upper_bounds[-1]
            else:
                grb_net.build_model_using_bounds(
                    domain[0],
                    ([lbs[0].cpu() for lbs in intermediate_lbs],
                     [ubs[0].cpu() for ubs in intermediate_ubs]),
                    n_threads=4)
                _, ub = grb_net.compute_lower_bound(ub_only=True)
            grb_end = time.time()
            grb_time = grb_end - grb_start
            grb_ubs = torch.Tensor(ub).cpu()
            dump_bounds(grb_target_file, grb_time, grb_ubs)

        ## Cuts
        for cut_steps in [80, 600, 1050, 1650, 2500]:
            explp_params = {
                "nb_iter": cut_steps,
                'bigm': "init",
                'cut': "only",
                "bigm_algorithm": "adam",
                'cut_frequency': 450,
                'max_cuts': 12,
                'cut_add': 2,
                'betas': (0.9, 0.999),
                'initial_step_size': 1e-3,
                'final_step_size': 1e-6,
                "init_params": {
                    "nb_outer_iter": 500,
                    'initial_step_size': 1e-1,
                    'final_step_size': 1e-3,
                    'betas': (0.9, 0.999)
                },
            }
            cut_target_file = os.path.join(
                target_dir, f"Cuts_{cut_steps}{lin_approx_string}.txt")
            if not os.path.exists(cut_target_file):
                cuda_elided_model = copy.deepcopy(elided_model).cuda()
                cuda_domain = domain.cuda()
                exp_net = ExpLP([lay for lay in cuda_elided_model],
                                params=explp_params, use_preactivation=True,
                                fixed_M=True)
                exp_start = time.time()
                with torch.no_grad():
                    if not args.from_intermediate_bounds:
                        exp_net.define_linear_approximation(cuda_domain)
                        ub = exp_net.upper_bounds[-1]
                    else:
                        exp_net.build_model_using_bounds(
                            cuda_domain, (intermediate_lbs, intermediate_ubs))
                        _, ub = exp_net.compute_lower_bound()
                exp_end = time.time()
                exp_time = exp_end - exp_start
                exp_ubs = ub.cpu()

                del exp_net
                dump_bounds(cut_target_file, exp_time, exp_ubs)

        ## Big-M supergradient (iterations tuned to take the same time as prox)
        for bigm_steps in [850]:
            bigm_adam_params = {
                "bigm_algorithm": "adam",
                "bigm": "only",
                "nb_outer_iter": bigm_steps,
                'initial_step_size': 1e-1,
                'final_step_size': 1e-3,
                'betas': (0.9, 0.999)
            }
            bigm_target_file = os.path.join(
                target_dir, f"Big-M_{bigm_steps}{lin_approx_string}.txt")
            if not os.path.exists(bigm_target_file):
                cuda_elided_model = copy.deepcopy(elided_model).cuda()
                cuda_domain = domain.cuda()
                bigm_net = ExpLP([lay for lay in cuda_elided_model],
                                 params=bigm_adam_params,
                                 use_preactivation=True, fixed_M=True)
                bigm_start = time.time()
                with torch.no_grad():
                    if not args.from_intermediate_bounds:
                        bigm_net.define_linear_approximation(cuda_domain)
                        ub = bigm_net.upper_bounds[-1]
                    else:
                        bigm_net.build_model_using_bounds(
                            cuda_domain, (intermediate_lbs, intermediate_ubs))
                        _, ub = bigm_net.compute_lower_bound()
                bigm_end = time.time()
                bigm_time = bigm_end - bigm_start
                bigm_ubs = ub.cpu()

                del bigm_net
                dump_bounds(bigm_target_file, bigm_time, bigm_ubs)

        ## Gurobi Anderson Bounds
        for n_cuts in [1]:
            grb_and_target_file = os.path.join(
                target_dir,
                f"Anderson-{n_cuts}cuts{lin_approx_string}-fixed.txt")
            if not os.path.exists(grb_and_target_file):
                lp_and_grb_net = AndersonLinearizedNetwork(
                    [lay for lay in elided_model], mode="lp-cut",
                    n_cuts=n_cuts, cuts_per_neuron=True)
                lp_and_grb_start = time.time()
                if not args.from_intermediate_bounds:
                    lp_and_grb_net.define_linear_approximation(domain[0],
                                                               n_threads=4)
                    ub = lp_and_grb_net.upper_bounds[-1]
                else:
                    lp_and_grb_net.build_model_using_bounds(
                        domain[0],
                        ([lbs[0].cpu() for lbs in intermediate_lbs],
                         [ubs[0].cpu() for ubs in intermediate_ubs]),
                        n_threads=4)
                    _, ub = lp_and_grb_net.compute_lower_bound(ub_only=True)
                lp_and_grb_end = time.time()
                lp_and_grb_time = lp_and_grb_end - lp_and_grb_start
                lp_and_grb_ubs = torch.Tensor(ub).cpu()
                dump_bounds(grb_and_target_file, lp_and_grb_time,
                            lp_and_grb_ubs)
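
# Standard entry point guard, assuming this file is run as a script
# (e.g. `python bound_comparison.py 0.1 results/`; the script name here is
# hypothetical).
if __name__ == '__main__':
    main()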