def gen_model_dict(bopmodel, dir):
    """Generate a dictionary of models keyed by their BOPS values, given a
    folder containing the model files.

    Provide an instance of the model you're loading (to calculate params)
    and the path to the folder containing the model files. Returns a dict in
    the format {BOPS: path to model} and the total param count of that model.
    Raises if a model with a different total param count is found.
    """
    model_dict = {}
    first = True
    total_param = 0
    if not os.path.isdir(dir):
        raise RuntimeError("Error! Unable to find directory " + dir)
    print("Directory found! Loading dir: " + dir)
    dir_list = os.listdir(dir)
    dir_list.sort()
    for file in dir_list:
        try:
            bopmodel.load_state_dict(
                torch.load(os.path.join(dir, file), map_location=device))
            _, total_cnt, _, _ = countNonZeroWeights(bopmodel)
            bops = calc_BOPS(bopmodel)
        except Exception as e:
            print("Warning! Failed to load file " + file)
            print(e)
            continue
        if first:
            # Assume the first (alphabetical) file sets the reference param
            # count, for the sake of checking that all .pth are the same model
            total_param = total_cnt
            first = False
        elif total_cnt != total_param:
            raise RuntimeError(
                "Error! Model mismatch while creating model dict! "
                "Expected {} total params, found {}".format(
                    total_param, total_cnt))
        model_dict.update({int(bops): file})
    return model_dict, total_param
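
# A minimal usage sketch (hypothetical, not part of the original pipeline):
# given a reference model instance and a checkpoint directory, pick the
# checkpoint whose BOPS count lands closest to a target compute budget.
# The helper name and `target_bops` are assumptions for illustration.
def _example_pick_model_by_bops(bopmodel, checkpoint_dir, target_bops):
    model_dict, total_param = gen_model_dict(bopmodel, checkpoint_dir)
    if not model_dict:
        raise RuntimeError("No loadable models found in " + checkpoint_dir)
    # Keys are integer BOPS counts, values are checkpoint filenames
    closest = min(model_dict, key=lambda bops: abs(bops - target_bops))
    return os.path.join(checkpoint_dir, model_dict[closest]), total_param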
def gen_bo_model_dict(dir, bits=32, loadmodel=None):
    """Modified to load a set of BO models (with varying layer sizes).

    Provide the path to a folder of BO model files (and the bit precision
    they were trained at). Returns a dict in the format
    {BOPS: path to model} and the total param count of the last model loaded.
    """
    model_dict = {}
    total_param = 0
    # Filenames look like "BO_{bits}b_[d1, d2, d3].pth"; the bit width takes
    # two characters when >9 (e.g. "12b"), one otherwise (e.g. "6b")
    bitlen = 2 if bits > 9 else 1
    if not os.path.isdir(dir):
        raise RuntimeError("Error! Unable to find directory " + dir)
    print("Directory found! Loading dir: " + dir)
    dir_list = os.listdir(dir)
    dir_list.sort()
    for file in dir_list:
        try:
            # Get the model size from the filename, just saves a bunch of
            # headache: strip the "BO_{bits}b_[" prefix and the "].pth"
            # suffix, leaving e.g. "64, 32, 32"
            sizestr = file[6 + bitlen:][:-5]
            dims = [int(m) for m in sizestr.replace(' ', '').split(',')]
            print(dims)
            prune_masks = {
                "fc1": torch.ones(dims[0], 16),
                "fc2": torch.ones(dims[1], dims[0]),
                "fc3": torch.ones(dims[2], dims[1]),
                "fc4": torch.ones(5, dims[2])
            }
            if bits < 32:
                bomodel = models.three_layer_model_bv_tunable(
                    prune_masks, dims, bits)
            else:
                # 32b, non-quantized model
                bomodel = models.three_layer_model_tunable(prune_masks, dims)
            bomodel.load_state_dict(
                torch.load(os.path.join(dir, file), map_location=device))
            _, total_param, _, _ = countNonZeroWeights(bomodel)
            bops = calc_BOPS(bomodel)
            model_dict.update({int(bops): file})
        except Exception as e:
            print("Warning! Failed to load file " + file)
            print(e)
    return model_dict, total_param
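
# Sketch of the filename convention the slicing above depends on (inferred
# from how post_bo_train() names its checkpoints): files look like
# "BO_{bits}b_[d1, d2, d3].pth". A regex parse is a less position-sensitive
# alternative; this hypothetical helper assumes `re` is imported at module top.
def _example_parse_bo_dims(filename):
    # e.g. "BO_6b_[64, 32, 32].pth" -> [64, 32, 32]
    match = re.search(r'\[(\d+),\s*(\d+),\s*(\d+)\]', filename)
    if match is None:
        raise ValueError("No layer sizes found in filename: " + filename)
    return [int(g) for g in match.groups()]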
def post_bo_train(dims, train_loader, val_loader, eval_loader, best=False):
    val_losses = []
    train_losses = []
    roc_auc_scores = []
    avg_precision_scores = []
    avg_train_losses = []
    avg_valid_losses = []
    accuracy_scores = []
    iter_eff = []
    early_stopping = EarlyStopping(patience=options.patience, verbose=True)
    # dims = {'fc1':dims[0],'fc2':dims[1],'fc3':dims[2]}
    model, prune_mask = create_model(dims, post=True)
    dims_str = str([dims['fc1s'], dims['fc2s'], dims['fc3s']])
    # Make sure to update the masks within the model
    model.update_masks(prune_mask)
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    criterion = nn.BCELoss()
    L1_factor = 0.0001  # Default Keras L1 loss
    estop = False
    epoch_counter = 0
    model.to(device)
    model.mask_to_device(device)
    print("~~~~~~~~~~~~~~~~~Starting Post BO Training for Model Size {}~~~~~~~~~~~~~~~~~~~~"
          .format(dims))
    if options.efficiency_calc and epoch_counter == 0:
        # Get efficiency of the uninitialized model
        aiq_dict, aiq_time = calc_AiQ(model, eval_loader, True, device=device)
        epoch_eff = aiq_dict['net_efficiency']
        iter_eff.append(aiq_dict)
        print('[epoch 0] Model Efficiency: %.7f' % epoch_eff)
        for layer in aiq_dict["layer_metrics"]:
            print('[epoch 0]\t Layer %s Efficiency: %.7f' %
                  (layer, aiq_dict['layer_metrics'][layer]['efficiency']))
    for epoch in range(options.epochs):  # loop over the dataset multiple times
        epoch_counter += 1
        # Train
        model, train_losses = train(model, optimizer, criterion, train_loader,
                                    L1_factor=L1_factor)
        # Validate
        val_losses, val_avg_precision_list, val_roc_auc_scores_list = val(
            model, criterion, val_loader, L1_factor=L1_factor)
        # Calculate average epoch statistics; fall back to torch when the
        # batch losses are still tensors rather than plain floats
        try:
            train_loss = np.average(train_losses)
        except Exception:
            train_loss = torch.mean(torch.stack(train_losses)).cpu().numpy()
        try:
            valid_loss = np.average(val_losses)
        except Exception:
            valid_loss = torch.mean(torch.stack(val_losses)).cpu().numpy()
        val_roc_auc_score = np.average(val_roc_auc_scores_list)
        val_avg_precision = np.average(val_avg_precision_list)
        if options.efficiency_calc:
            aiq_dict, aiq_time = calc_AiQ(model, eval_loader, True,
                                          device=device)
            epoch_eff = aiq_dict['net_efficiency']
            iter_eff.append(aiq_dict)
        avg_train_losses.append(train_loss.tolist())
        avg_valid_losses.append(valid_loss.tolist())
        avg_precision_scores.append(val_avg_precision)
        # Print epoch statistics
        print('[epoch %d] train batch loss: %.7f' % (epoch + 1, train_loss))
        print('[epoch %d] val batch loss: %.7f' % (epoch + 1, valid_loss))
        print('[epoch %d] val ROC AUC Score: %.7f' %
              (epoch + 1, val_roc_auc_score))
        print('[epoch %d] val Avg Precision Score: %.7f' %
              (epoch + 1, val_avg_precision))
        if options.efficiency_calc:
            print('[epoch %d] Model Efficiency: %.7f' % (epoch + 1, epoch_eff))
            print('[epoch %d] aIQ Calc Time: %.7f seconds' %
                  (epoch + 1, aiq_time))
            for layer in aiq_dict["layer_metrics"]:
                print('[epoch %d]\t Layer %s Efficiency: %.7f' %
                      (epoch + 1, layer,
                       aiq_dict['layer_metrics'][layer]['efficiency']))
        # Check if we need to early stop
        early_stopping(valid_loss, model)
        if early_stopping.early_stop:
            print("Early stopping")
            estop = True
            epoch_counter -= options.patience
            break
    # Load last/best checkpoint model saved via early stopping
    model.load_state_dict(torch.load('checkpoint.pt'))
    # Timestamp for plots
    now = datetime.now()
    time = now.strftime("%d-%m-%Y_%H-%M-%S")
    # Plot & save losses for this iteration
    loss_plt = plt.figure()
    loss_ax = loss_plt.add_subplot()
    loss_ax.plot(range(1, len(avg_train_losses) + 1), avg_train_losses,
                 label='Training Loss')
    loss_ax.plot(range(1, len(avg_valid_losses) + 1), avg_valid_losses,
                 label='Validation Loss')
    # Find position of lowest validation loss
    if estop:
        minposs = avg_valid_losses.index(min(avg_valid_losses))
    else:
        minposs = options.epochs
    # Update our epoch counter to represent where the model actually stopped training
    epoch_counter -= (len(avg_valid_losses) - minposs)
    nbits = model.weight_precision if hasattr(model, 'weight_precision') else 32
    # Plot losses for this iter
    loss_ax.axvline(minposs, linestyle='--', color='r',
                    label='Early Stopping Checkpoint')
    loss_ax.set_xlabel('epochs')
    loss_ax.set_ylabel('loss')
    loss_ax.grid(True)
    loss_ax.legend()
    filename = 'loss_plot_{}b_e{}_{}_.png'.format(nbits, epoch_counter, dims_str)
    loss_ax.set_title('Loss from epoch 1 to {}, {}b model'.format(
        epoch_counter, nbits))
    loss_plt.savefig(path.join(options.outputDir, filename),
                     bbox_inches='tight')
    loss_plt.show()
    plt.close(loss_plt)
    if options.efficiency_calc:
        # Plot & save efficiency for this iteration
        loss_plt = plt.figure()
        loss_ax = loss_plt.add_subplot()
        loss_ax.set_title('Net Eff. from epoch 0 to {}, {}b {} model'.format(
            epoch_counter, nbits, dims_str))
        loss_ax.plot(range(0, len(iter_eff)),
                     [z['net_efficiency'] for z in iter_eff],
                     label='Net Efficiency', color='green')
        # loss_ax.plot(range(1, len(iter_eff) + 1), [z["layer_metrics"][layer]['efficiency'] for z in iter_eff])
        loss_ax.axvline(minposs, linestyle='--', color='r',
                        label='Early Stopping Checkpoint')
        loss_ax.set_xlabel('epochs')
        loss_ax.set_ylabel('Net Efficiency')
        loss_ax.grid(True)
        loss_ax.legend()
        filename = 'eff_plot_{}b_e{}_{}_.png'.format(nbits, epoch_counter,
                                                     dims_str)
        loss_plt.savefig(path.join(options.outputDir, filename),
                         bbox_inches='tight')
        loss_plt.show()
        plt.close(loss_plt)
    # Save the trained model's weights, then compute its final aIQ results
    model_filename = "BO_{}b_{}.pth".format(nbits, dims_str)
    model_filename2 = "BO_{}b_{}_full.pth".format(nbits, dims_str)
    os.makedirs(path.join(options.outputDir, "full_models"), exist_ok=True)
    torch.save(model.state_dict(),
               path.join(options.outputDir, model_filename))
    # torch.save(model, path.join(options.outputDir, "full_models", model_filename2))
    final_aiq = calc_AiQ(model, eval_loader, batnorm=True, device=device,
                         full_results=True,
                         testlabels=test_dataset.labels_list)
    model_totalloss_json_dict = {
        options.bits: [[avg_train_losses, avg_valid_losses], iter_eff,
                       [minposs]]
    }
    filename = 'model_losses_{}_{}.json'.format(options.bits, dims_str)
    with open(path.join(options.outputDir, filename), 'w') as fp:
        json.dump(model_totalloss_json_dict, fp)
    final_aiq.update({'dims': str(dims_str), 'best': best})
    aiq_entry = {int(calc_BOPS(model)): final_aiq}
    return aiq_entry
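
# Hypothetical driver sketch (not from the original file): post_bo_train()
# returns a single {BOPS: aIQ-results} entry, so a sweep over candidate
# layer sizes can be merged into one dict and dumped as a combined summary.
# The dims-dict keys ('fc1s', 'fc2s', 'fc3s') match what post_bo_train() reads;
# the output filename is an assumption for illustration.
def _example_bo_sweep(dim_candidates, train_loader, val_loader, eval_loader):
    results = {}
    for fc1, fc2, fc3 in dim_candidates:
        dims = {'fc1s': fc1, 'fc2s': fc2, 'fc3s': fc3}
        results.update(
            post_bo_train(dims, train_loader, val_loader, eval_loader))
    with open(path.join(options.outputDir, 'bo_sweep_results.json'), 'w') as fp:
        json.dump(results, fp)
    return results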
    plt.close(loss_plt)
    # Save the best model's weights, then compute and dump its aIQ results
    model_filename = "BO_{}b_best_{}.pth".format(nbits, time)
    torch.save(model.state_dict(),
               path.join(options.outputDir, model_filename))
    final_aiq = calc_AiQ(model, test_loader, batnorm=True, device=device,
                         full_results=True,
                         testlabels=test_dataset.labels_list)
    model_totalloss_json_dict = {
        options.bits: [[avg_train_losses, avg_valid_losses], iter_eff,
                       [minposs]]
    }
    filename = 'model_losses_{}_{}.json'.format(options.size, options.bits)
    with open(path.join(options.outputDir, filename), 'w') as fp:
        json.dump(model_totalloss_json_dict, fp)
    filename = 'model_AIQ_{}_{}.json'.format(options.size, options.bits)
    with open(path.join(options.outputDir, filename), 'w') as fp:
        json.dump(
            {
                '{}b'.format(options.bits): {
                    int(calc_BOPS(model)): final_aiq,
                    'dims': str(model_size)
                }
            }, fp)
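
# Read-back sketch (assumption: the JSON layout matches the dump above, i.e.
# {"<bits>b": {"<BOPS>": <aIQ results>, "dims": "<size list>"}}). JSON
# round-tripping turns the integer BOPS key into a string, so cast it back.
# The helper name is hypothetical.
def _example_load_aiq(json_path):
    with open(json_path) as fp:
        payload = json.load(fp)
    (bits_key, entry), = payload.items()
    dims = entry.pop('dims')
    (bops, aiq), = entry.items()
    return bits_key, int(bops), dims, aiq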