def optimize_quantization_points(modelToQuantize,
                                 train_loader,
                                 test_loader,
                                 initial_learning_rate=1e-5,
                                 initial_momentum=0.9,
                                 epochs_to_train=30,
                                 print_every=500,
                                 use_nesterov=True,
                                 learning_rate_style='generic',
                                 numPointsPerTensor=16,
                                 assignBitsAutomatically=False,
                                 bucket_size=None,
                                 use_distillation_loss=True,
                                 initialize_method='quantiles',
                                 quantize_first_and_last_layer=True):
    """Optimize the quantization points of every tensor of a model with SGD.

    A deep copy of ``modelToQuantize`` is created. At every minibatch the
    parameters of the copy are replaced by their quantized version (computed
    from the current quantization points), the loss is back-propagated with
    ``modelToQuantize`` passed as the teacher model, and the parameter
    gradients are converted into gradients of the quantization points, which
    are the only quantities updated by the optimizer.

    Args:
        modelToQuantize: model whose parameter tensors are quantized. It is
            put in eval mode and deep-copied; the copy is the one trained.
        train_loader: iterable of minibatches; must support ``len``.
        test_loader: loader passed to ``cnn_hf.evaluateModel`` every epoch.
        initial_learning_rate: learning rate given to SGD and the scheduler.
        initial_momentum: SGD momentum. When 0, neither momentum nor
            nesterov is passed to the optimizer.
        epochs_to_train: maximum number of epochs.
        print_every: print the running loss every this many minibatches.
            Capped to half the number of minibatches when larger than it,
            and never lower than 1.
        use_nesterov: passed to SGD when momentum is non-zero.
        learning_rate_style: passed to ``cnn_hf.LearningRateScheduler``.
        numPointsPerTensor: an int (used for every tensor) or a sequence
            with one entry per parameter tensor of the model.
        assignBitsAutomatically: if true, the 2-norm of the gradient
            accumulated over up to 5 minibatches is used to redistribute the
            number of points among tensors.
        bucket_size: passed to the scaling and quantization functions.
        use_distillation_loss: passed to ``cnn_hf.forward_and_backward``.
        initialize_method: ``'quantiles'`` or ``'uniform'`` (case
            insensitive).
        quantize_first_and_last_layer: when ``False`` the first and the last
            parameter tensors are left untouched.

    Returns:
        A tuple ``(state_dict, pointsPerTensor, informationDict)``: the state
        dict of the quantized copy, the list of optimized quantization
        points and a dict with the keys ``'predictionAccuracy'``,
        ``'numEpochsTrained'`` and ``'lossSaved'``.

    Raises:
        ValueError: if ``initialize_method`` is not recognized or if
            ``numPointsPerTensor`` does not have one entry per tensor.
    """
    print('Preparing training - pre processing tensors')

    numTensorsNetwork = sum(1 for _ in modelToQuantize.parameters())
    initialize_method = initialize_method.lower()
    if initialize_method not in ('quantiles', 'uniform'):
        raise ValueError(
            'The initialization method must be either quantiles or uniform')

    if isinstance(numPointsPerTensor, int):
        numPointsPerTensor = [numPointsPerTensor] * numTensorsNetwork

    if len(numPointsPerTensor) != numTensorsNetwork:
        raise ValueError(
            'numPointsPerTensor must be equal to the number of tensor in the network'
        )

    skip_first_and_last = quantize_first_and_last_layer is False
    if skip_first_and_last:
        numPointsPerTensor = numPointsPerTensor[1:-1]

    def tensors_to_quantize(model):
        """Return the parameters of `model` that have quantization points."""
        params = list(model.parameters())
        return params[1:-1] if skip_first_and_last else params

    # same scaling function that is used inside nonUniformQuantization.
    # It is important they are the same
    scalingFunction = quantization.ScalingFunction('linear', False, False,
                                                   bucket_size, False)

    # if assigning bits automatically, use the 2-norm of the gradient to
    # determine weights importance
    if assignBitsAutomatically:
        num_to_estimate_grad = 5
        modelToQuantize.zero_grad()
        for idx_minibatch, batch in enumerate(train_loader, start=1):
            cnn_hf.forward_and_backward(modelToQuantize,
                                        batch,
                                        idx_batch=idx_minibatch,
                                        epoch=0,
                                        use_distillation_loss=False)
            if idx_minibatch >= num_to_estimate_grad:
                break

        # now we compute the 2-norm of the gradient for each parameter
        fisherInformation = [
            (p.grad.data / num_to_estimate_grad).norm()
            for p in tensors_to_quantize(modelToQuantize)
        ]

        # zero the grad we computed
        modelToQuantize.zero_grad()

        # now we use a simple linear proportion to assign bits
        # the minimum number of points is half what was given as input
        numPointsPerTensor = quantization.help_functions.assign_bits_automatically(
            fisherInformation, numPointsPerTensor, input_is_point=True)

    # initialize the points using the percentile function so as to make them
    # all usable
    pointsPerTensor = []
    if initialize_method == 'quantiles':
        for idx, p in enumerate(tensors_to_quantize(modelToQuantize)):
            initial_points = quantization.help_functions.initialize_quantization_points(
                p.data, scalingFunction, numPointsPerTensor[idx])
            initial_points = Variable(initial_points, requires_grad=True)
            # do a dummy backprop so that the grad attribute is initialized.
            # We need this because we assign the gradient manually later on
            # (since pytorch can't assign variables to model parameters)
            initial_points.sum().backward()
            pointsPerTensor.append(initial_points)
    else:  # 'uniform', already validated above
        for numPoint in numPointsPerTensor:
            initial_points = torch.FloatTensor(
                [x / (numPoint - 1) for x in range(numPoint)])
            if USE_CUDA:
                initial_points = initial_points.cuda()
            initial_points = Variable(initial_points, requires_grad=True)
            # dummy backprop, same reason as in the 'quantiles' branch
            initial_points.sum().backward()
            pointsPerTensor.append(initial_points)

    # dealing with 0 momentum
    options_optimizer = {}
    if initial_momentum != 0:
        options_optimizer = {
            'momentum': initial_momentum,
            'nesterov': use_nesterov
        }
    optimizer = optim.SGD(pointsPerTensor,
                          lr=initial_learning_rate,
                          **options_optimizer)

    lr_scheduler = cnn_hf.LearningRateScheduler(initial_learning_rate,
                                                learning_rate_style)
    startTime = time.time()

    pred_accuracy_epochs = []
    losses_epochs = []
    last_loss_saved = float('inf')
    number_minibatches_per_epoch = len(train_loader)

    if print_every > number_minibatches_per_epoch:
        print_every = number_minibatches_per_epoch // 2
    # With a single minibatch the halving above yields 0, which would make
    # the modulo below raise ZeroDivisionError.
    print_every = max(1, print_every)

    modelToQuantize.eval()
    quantizedModel = copy.deepcopy(modelToQuantize)
    epoch = 0

    # efficient version of nonUniformQuantization, one per quantized tensor
    quantizationFunctions = [
        quantization.nonUniformQuantization_variable(max_element=False,
                                                     subtract_mean=False,
                                                     modify_in_place=False,
                                                     bucket_size=bucket_size,
                                                     pre_process_tensors=True,
                                                     tensor=p.data)
        for p in tensors_to_quantize(quantizedModel)
    ]

    print('Pre processing done, training started')

    for epoch in range(epochs_to_train):
        quantizedModel.train()
        print_loss_total = 0
        for idx_minibatch, data in enumerate(train_loader, start=1):

            # zero the gradient of the parameters model
            quantizedModel.zero_grad()
            optimizer.zero_grad()

            # quantize the model parameters (efficient quantization)
            for idx, p_quantized in enumerate(
                    tensors_to_quantize(quantizedModel)):
                p_quantized.data = quantizationFunctions[idx].forward(
                    None, pointsPerTensor[idx].data)

            print_loss = cnn_hf.forward_and_backward(
                quantizedModel,
                data,
                idx_minibatch,
                epoch,
                use_distillation_loss=use_distillation_loss,
                teacher_model=modelToQuantize)

            # now get the gradient of the pointsPerTensor
            for idx, p in enumerate(tensors_to_quantize(quantizedModel)):
                pointsPerTensor[idx].grad.data = quantizationFunctions[
                    idx].backward(p.grad.data)[1]

            optimizer.step()

            # after optimizer.step() we need to make sure that the points
            # are still sorted. Implementation detail
            for points in pointsPerTensor:
                points.data = torch.sort(points.data)[0]

            # print statistics
            print_loss_total += print_loss
            if idx_minibatch % print_every == 0:
                last_loss_saved = print_loss_total / print_every
                str_to_print = 'Time Elapsed: {}, [Epoch: {}, Minibatch: {}], loss: {:3f}'.format(
                    mhf.timeSince(startTime), epoch + 1, idx_minibatch,
                    last_loss_saved)
                if pred_accuracy_epochs:
                    str_to_print += '. Last prediction accuracy: {:2f}%'.format(
                        pred_accuracy_epochs[-1] * 100)
                print(str_to_print)
                print_loss_total = 0

        losses_epochs.append(last_loss_saved)
        curr_pred_accuracy = cnn_hf.evaluateModel(quantizedModel,
                                                  test_loader,
                                                  fastEvaluation=False)
        pred_accuracy_epochs.append(curr_pred_accuracy)
        print(' === Epoch: {} - prediction accuracy {:2f}% === '.format(
            epoch + 1, curr_pred_accuracy * 100))

        # updating the learning rate
        new_learning_rate, stop_training = lr_scheduler.update_learning_rate(
            epoch, 1 - curr_pred_accuracy)
        if stop_training is True:
            break
        for param_group in optimizer.param_groups:
            param_group['lr'] = new_learning_rate

    print('Finished Training in {} epochs'.format(epoch + 1))
    informationDict = {
        'predictionAccuracy': pred_accuracy_epochs,
        'numEpochsTrained': epoch + 1,
        'lossSaved': losses_epochs
    }

    # IMPORTANT: When there are batch normalization layers, important
    # information is also contained in the running mean and running var
    # values of the batch normalization layers. Since these are not
    # parameters, they don't show up in the model.parameters() list (and they
    # don't have quantization points associated with them). So returning just
    # the optimized quantization points and quantizing the model weights with
    # them would give inferior performance, because the running mean and var
    # of the batch normalization layers would not be saved. To solve this the
    # quantized model state dict is returned too: it contains not only the
    # parameters of the model but also these statistics.
    return quantizedModel.state_dict(), pointsPerTensor, informationDict
# Move the quantized distilled model to the GPU(s) when available.
if USE_CUDA:
    quant_distilled_model = quant_distilled_model.cuda()
if NUM_GPUS > 1:
    quant_distilled_model = torch.nn.parallel.DataParallel(
        quant_distilled_model)

# Register the model with the manager the first time it is seen.
if quant_distilled_model_name not in imagenet_manager.saved_models:
    imagenet_manager.add_new_model(
        quant_distilled_model_name,
        quantDistilledModelPath,
        arguments_creator_function=quantDistilledOptions)

# Quantized distillation training, with the unquantized AlexNet as teacher.
if TRAIN_QUANTIZED_DISTILLED:
    imagenet_manager.train_model(
        quant_distilled_model,
        model_name=quant_distilled_model_name,
        train_function=convForwModel.train_model,
        arguments_train_function={
            'epochs_to_train': epochsToTrainImageNet,
            'learning_rate_style': 'imagenet',
            'initial_learning_rate': 0.1,
            'use_nesterov': True,
            'initial_momentum': 0.9,
            'weight_decayL2': 1e-4,
            'start_epoch': 0,
            'print_every': 30,
            'use_distillation_loss': True,
            'teacher_model': alexnet_unquantized,
            'quantizeWeights': True,
            'numBits': NUM_BITS,
            'bucket_size': 256,
            'quantize_first_and_last_layer': False,
        },
        train_loader=train_loader,
        test_loader=test_loader)

# Reload the saved weights and report the accuracy on the test set.
quant_distilled_model.load_state_dict(
    imagenet_manager.load_model_state_dict(quant_distilled_model_name))
# NOTE(review): the loader was missing here although every other
# evaluateModel call in this file passes one — confirm against cnn_hf.
print(cnn_hf.evaluateModel(quant_distilled_model, test_loader))
def train_model(model,
                train_loader,
                test_loader,
                initial_learning_rate=0.001,
                use_nesterov=True,
                initial_momentum=0.9,
                weight_decayL2=0.00022,
                epochs_to_train=100,
                print_every=500,
                learning_rate_style='generic',
                use_distillation_loss=False,
                teacher_model=None,
                quantizeWeights=False,
                numBits=8,
                grad_clipping_threshold=False,
                start_epoch=0,
                bucket_size=None,
                quantizationFunctionToUse='uniformLinearScaling',
                backprop_quantization_style='none',
                estimate_quant_grad_every=1,
                add_gradient_noise=False,
                ask_teacher_strategy=('always', None),
                quantize_first_and_last_layer=True,
                mix_with_differentiable_quantization=False):
    """Train `model` with SGD, optionally with distillation and quantization.

    When ``quantizeWeights`` is true the weights are quantized only to
    compute the gradients: the full-precision weights are saved before the
    forward/backward pass, restored afterwards and then updated with the
    gradients obtained on the quantized weights. After training the weights
    are quantized one final time.

    Args:
        model: model to train (modified in place and also returned).
        train_loader: iterable of minibatches; must support ``len``.
        test_loader: loader passed to ``cnn_hf.evaluateModel`` every epoch.
        initial_learning_rate, use_nesterov, initial_momentum,
            weight_decayL2: SGD options.
        epochs_to_train: number of epochs to run, starting at ``start_epoch``.
        print_every: print the running loss every this many minibatches.
            Capped to half the number of minibatches when larger than it,
            and never lower than 1.
        learning_rate_style: passed (lower-cased) to
            ``cnn_hf.LearningRateScheduler``.
        use_distillation_loss: requires ``teacher_model`` when true.
        teacher_model: teacher network; put in eval mode when given.
        quantizeWeights: enable quantized training as described above.
        numBits: number of bits used to derive the quantization levels.
        grad_clipping_threshold: ``False`` to disable, otherwise gradients
            are clamped to ``[-threshold, threshold]``.
        start_epoch: index of the first epoch.
        bucket_size: passed to the quantization functions.
        quantizationFunctionToUse: ``'uniformAbsMaxScaling'`` or
            ``'uniformLinearScaling'`` (case insensitive).
        backprop_quantization_style: how gradients are modified to take the
            quantization function into account. ``'none'``: gradients are
            not modified; ``'truncated'``: gradients of weights whose
            absolute value exceeds 1 are set to 0 (as per
            https://arxiv.org/pdf/1609.07061.pdf); ``'complicated'``: the
            backward of ``quantization.uniformQuantization_variable`` is
            applied (slow).
        estimate_quant_grad_every: quantize the weights for the gradient
            computation only every this many steps.
        add_gradient_noise: add gradient noise (only when not quantizing).
        ask_teacher_strategy: passed to ``cnn_hf.forward_and_backward``.
        quantize_first_and_last_layer: when ``False`` the first and the last
            parameter tensors are not quantized.
        mix_with_differentiable_quantization: alternate every epoch (but the
            last) with one epoch of ``optimize_quantization_points``.

    Returns:
        A tuple ``(model, informationDict)``. ``informationDict`` always has
        ``'errorFlag'`` and ``'numEpochsTrained'``; unless an exception
        stopped the training it also has ``'percentages_asked_teacher'``,
        ``'predictionAccuracy'`` and ``'lossSaved'``.

    Raises:
        ValueError: if distillation is requested without a teacher model, or
            if the quantization function / backprop style is not recognized.
    """
    if use_distillation_loss is True and teacher_model is None:
        raise ValueError(
            'To compute distillation loss you have to pass the teacher model')

    if teacher_model is not None:
        teacher_model.eval()

    learning_rate_style = learning_rate_style.lower()
    lr_scheduler = cnn_hf.LearningRateScheduler(initial_learning_rate,
                                                learning_rate_style)
    new_learning_rate = initial_learning_rate
    optimizer = optim.SGD(model.parameters(),
                          lr=initial_learning_rate,
                          nesterov=use_nesterov,
                          momentum=initial_momentum,
                          weight_decay=weight_decayL2)
    startTime = time.time()

    pred_accuracy_epochs = []
    percentages_asked_teacher = []
    losses_epochs = []
    informationDict = {}
    last_loss_saved = float('inf')
    step_since_last_grad_quant_estimation = 1
    number_minibatches_per_epoch = len(train_loader)

    if quantizeWeights:
        quantizationFunctionToUse = quantizationFunctionToUse.lower()
        if backprop_quantization_style is None:
            backprop_quantization_style = 'none'
        backprop_quantization_style = backprop_quantization_style.lower()

        if quantizationFunctionToUse == 'uniformAbsMaxScaling'.lower():
            s = 2**(numBits - 1)
            type_of_scaling = 'absmax'
        elif quantizationFunctionToUse == 'uniformLinearScaling'.lower():
            s = 2**numBits
            type_of_scaling = 'linear'
        else:
            raise ValueError(
                'The specified quantization function is not present')

        if backprop_quantization_style in ('none', 'truncated'):

            def quantizeFunctions(x):
                return quantization.uniformQuantization(
                    x,
                    s,
                    type_of_scaling=type_of_scaling,
                    stochastic_rounding=False,
                    max_element=False,
                    subtract_mean=False,
                    modify_in_place=False,
                    bucket_size=bucket_size)[0]
        elif backprop_quantization_style == 'complicated':
            # one quantization object per parameter tensor
            quantizeFunctions = [
                quantization.uniformQuantization_variable(
                    s,
                    type_of_scaling=type_of_scaling,
                    stochastic_rounding=False,
                    max_element=False,
                    subtract_mean=False,
                    modify_in_place=False,
                    bucket_size=bucket_size) for _ in model.parameters()
            ]
        else:
            raise ValueError(
                'The specified backprop_quantization_style not recognized')

        num_parameters = sum(1 for _ in model.parameters())

        def quantize_weights_model(model):
            """Replace the weights of `model` with their quantized version."""
            for idx, p in enumerate(model.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == num_parameters - 1:
                        continue  # don't quantize first and last layer
                if backprop_quantization_style == 'truncated':
                    p.data.clamp_(-1, 1)
                if backprop_quantization_style in ('none', 'truncated'):
                    p.data = quantizeFunctions(p.data)
                elif backprop_quantization_style == 'complicated':
                    p.data = quantizeFunctions[idx].forward(p.data)
                else:
                    raise ValueError

        def backward_quant_weights_model(model):
            """Modify the gradients according to the backprop style."""
            if backprop_quantization_style == 'none':
                return

            for idx, p in enumerate(model.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == num_parameters - 1:
                        continue  # don't quantize first and last layer

                # For the truncated style we truncate the grad of the
                # weights as per the paper https://arxiv.org/pdf/1609.07061.pdf
                # (gradients of weights above 1 in absolute value are put to
                # 0). Their case is not immediately applicable to ours, but
                # let's try this out.
                if backprop_quantization_style == 'truncated':
                    p.grad.data[p.data.abs() > 1] = 0
                elif backprop_quantization_style == 'complicated':
                    p.grad.data = quantizeFunctions[idx].backward(p.grad.data)

    if print_every > number_minibatches_per_epoch:
        print_every = number_minibatches_per_epoch // 2
    # With a single minibatch the halving above yields 0; the resulting
    # ZeroDivisionError would be swallowed by the handler below and training
    # would silently stop at the first minibatch.
    print_every = max(1, print_every)

    try:
        epoch = start_epoch
        for epoch in range(start_epoch, epochs_to_train + start_epoch):
            print("begin training")
            if USE_CUDA:
                print("USE_CUDA")
            if mix_with_differentiable_quantization:
                print('=== Starting Quantized Distillation epoch === ')

            model.train()
            print_loss_total = 0
            count_asked_teacher = 0
            count_asked_total = 0
            for idx_minibatch, data in enumerate(train_loader, start=1):
                # True on the steps where the quantized gradient is estimated
                estimate_now = (step_since_last_grad_quant_estimation >=
                                estimate_quant_grad_every)

                if quantizeWeights and estimate_now:
                    # we save them because we only want to quantize weights
                    # to compute gradients, but keep using non-quantized
                    # weights during the algorithm
                    model_state_dict = model.state_dict()
                    quantize_weights_model(model)

                model.zero_grad()
                print_loss, curr_c_teach, curr_c_total = cnn_hf.forward_and_backward(
                    model,
                    data,
                    idx_minibatch,
                    epoch,
                    use_distillation_loss=use_distillation_loss,
                    teacher_model=teacher_model,
                    ask_teacher_strategy=ask_teacher_strategy,
                    return_more_info=True)
                count_asked_teacher += curr_c_teach
                count_asked_total += curr_c_total

                # load the non-quantized weights and use them for the update.
                # The quantized weights are used only to get the quantized
                # gradient
                if quantizeWeights and estimate_now:
                    model.load_state_dict(model_state_dict)
                    del model_state_dict  # free memory

                if add_gradient_noise and not quantizeWeights:
                    cnn_hf.add_gradient_noise(model, idx_minibatch, epoch,
                                              number_minibatches_per_epoch)

                if grad_clipping_threshold is not False:
                    # gradient clipping
                    for p in model.parameters():
                        p.grad.data.clamp_(-grad_clipping_threshold,
                                           grad_clipping_threshold)

                if quantizeWeights and estimate_now:
                    backward_quant_weights_model(model)

                optimizer.step()

                if estimate_now:
                    step_since_last_grad_quant_estimation = 0
                step_since_last_grad_quant_estimation += 1

                # print statistics
                print_loss_total += print_loss
                if idx_minibatch % print_every == 0:
                    last_loss_saved = print_loss_total / print_every
                    str_to_print = 'Time Elapsed: {}, [Start Epoch: {}, Epoch: {}, Minibatch: {}], loss: {:3f}'.format(
                        mhf.timeSince(startTime), start_epoch + 1, epoch + 1,
                        idx_minibatch, last_loss_saved)
                    if pred_accuracy_epochs:
                        str_to_print += ' Last prediction accuracy: {:2f}%'.format(
                            pred_accuracy_epochs[-1] * 100)
                    print(str_to_print)
                    print_loss_total = 0

            curr_percentages_asked_teacher = count_asked_teacher / count_asked_total if count_asked_total != 0 else 0
            percentages_asked_teacher.append(curr_percentages_asked_teacher)
            losses_epochs.append(last_loss_saved)
            curr_pred_accuracy = cnn_hf.evaluateModel(model,
                                                      test_loader,
                                                      fastEvaluation=False)
            pred_accuracy_epochs.append(curr_pred_accuracy)
            print(' === Epoch: {} - prediction accuracy {:2f}% === '.format(
                epoch + 1, curr_pred_accuracy * 100))

            if mix_with_differentiable_quantization and epoch != start_epoch + epochs_to_train - 1:
                print('=== Starting Differentiable Quantization epoch === ')
                # the diff quant step is not done at the last epoch, so we
                # end on a quantized distillation epoch
                model_state_dict = optimize_quantization_points(
                    model,
                    train_loader,
                    test_loader,
                    new_learning_rate,
                    initial_momentum=initial_momentum,
                    epochs_to_train=1,
                    print_every=print_every,
                    use_nesterov=use_nesterov,
                    learning_rate_style=learning_rate_style,
                    numPointsPerTensor=2**numBits,
                    assignBitsAutomatically=True,
                    bucket_size=bucket_size,
                    use_distillation_loss=True,
                    initialize_method='quantiles',
                    quantize_first_and_last_layer=quantize_first_and_last_layer
                )[0]
                model.load_state_dict(model_state_dict)
                del model_state_dict  # free memory
                losses_epochs.append(last_loss_saved)
                curr_pred_accuracy = cnn_hf.evaluateModel(
                    model, test_loader, fastEvaluation=False)
                pred_accuracy_epochs.append(curr_pred_accuracy)
                print(
                    ' === Epoch: {} - prediction accuracy {:2f}% === '.format(
                        epoch + 1, curr_pred_accuracy * 100))

            # updating the learning rate
            new_learning_rate, stop_training = lr_scheduler.update_learning_rate(
                epoch, 1 - curr_pred_accuracy)
            if stop_training is True:
                break
            for param_group in optimizer.param_groups:
                param_group['lr'] = new_learning_rate

    except Exception as e:
        print(
            'An exception occurred: {}\n. Training has been stopped after {} epochs.'
            .format(e, epoch))
        informationDict['errorFlag'] = True
        informationDict['numEpochsTrained'] = epoch - start_epoch
        return model, informationDict
    except KeyboardInterrupt:
        print('User stopped training after {} epochs'.format(epoch))
        informationDict['errorFlag'] = False
        informationDict['numEpochsTrained'] = epoch - start_epoch
    else:
        print('Finished Training in {} epochs'.format(epoch + 1))
        informationDict['errorFlag'] = False
        informationDict['numEpochsTrained'] = epoch + 1 - start_epoch

    # leave the model with quantized weights
    if quantizeWeights:
        quantize_weights_model(model)

    # every epoch was followed by a differentiable quantization epoch
    if mix_with_differentiable_quantization:
        informationDict['numEpochsTrained'] *= 2

    informationDict['percentages_asked_teacher'] = percentages_asked_teacher
    informationDict['predictionAccuracy'] = pred_accuracy_epochs
    informationDict['lossSaved'] = losses_epochs
    return model, informationDict
# Build the teacher network from the reference specification.
teacherModel = convForwModel.ConvolForwardNet(
    useBatchNorm=USE_BATCH_NORM,
    useAffineTransformInBatchNorm=AFFINE_BATCH_NORM,
    **convForwModel.teacherModelSpec)
if USE_CUDA:
    teacherModel = teacherModel.cuda()

# Register the teacher with the model manager the first time it is seen.
if model_name not in cifar10Manager.saved_models:
    cifar10Manager.add_new_model(
        model_name,
        teacherModelPath,
        arguments_creator_function=dict(
            convForwModel.teacherModelSpec,
            useBatchNorm=USE_BATCH_NORM,
            useAffineTransformInBatchNorm=AFFINE_BATCH_NORM))

if TRAIN_TEACHER_MODEL:
    cifar10Manager.train_model(
        teacherModel,
        model_name=model_name,
        train_function=convForwModel.train_model,
        arguments_train_function={'epochs_to_train': epochsToTrainCIFAR},
        train_loader=train_loader,
        test_loader=test_loader)

# Reload the saved weights and evaluate them with k=5.
teacherModel.load_state_dict(cifar10Manager.load_model_state_dict(model_name))
cnn_hf.evaluateModel(teacherModel, test_loader, k=5)

# Define the architectures we want to try
smallerModelSpec0 = {
    'spec_conv_layers': [(75, 5, 5), (50, 5, 5), (50, 5, 5), (25, 5, 5)],
    'spec_max_pooling': [(1, 2, 2), (3, 2, 2)],
    'spec_dropout_rates': [(1, 0.2), (3, 0.3), (4, 0.4)],
    'spec_linear': [500],
    'width': 32,
    'height': 32,
}
smallerModelSpec1 = {
    'spec_conv_layers': [(50, 5, 5), (25, 5, 5), (25, 5, 5), (10, 5, 5)],
    'spec_max_pooling': [(1, 2, 2), (3, 2, 2)],
    'spec_dropout_rates': [(1, 0.2), (3, 0.3), (4, 0.4)],
    'spec_linear': [400],
    'width': 32,
    'height': 32,
}
smallerModelSpec2 = {
    'spec_conv_layers': [(25, 5, 5), (10, 5, 5), (10, 5, 5), (5, 5, 5)],
    'spec_max_pooling': [(1, 2, 2), (3, 2, 2)],
    'spec_dropout_rates': [(1, 0.2), (3, 0.3), (4, 0.4)],
    'spec_linear': [300],
    'width': 32,
    'height': 32,
}
# Map the boolean flag to the name of the initialization method.
compute_initial_points = ('quantiles'
                          if compute_initial_points is True else 'uniform')

# Identifier appended to the distilled model name / path for this setup.
str_identifier = 'quantpoints{}bits_auto{}_distill{}_initial"{}"'.format(
    numBit, assign_bits_auto, use_distillation_loss, compute_initial_points)
distilled_quantized_model_name = distilled_model_name + str_identifier

save_path = cifar10Manager.get_model_base_path(
    distilled_model_name) + str_identifier
# NOTE: pickle.load can execute arbitrary code; only open files written by
# this project.
with open(save_path, 'rb') as p:
    quantization_points, infoDict = pickle.load(p)

# Rebuild the network and load the state dict saved next to the points.
distilled_quantized_model = convForwModel.ConvolForwardNet(
    useBatchNorm=USE_BATCH_NORM,
    useAffineTransformInBatchNorm=AFFINE_BATCH_NORM,
    **distilledModelSpec)
if USE_CUDA:
    distilled_quantized_model = distilled_quantized_model.cuda()
distilled_quantized_model.load_state_dict(
    torch.load(save_path + '_model_state_dict'))

reported_accuracy = max(infoDict['predictionAccuracy'])
# this corresponds to the last one:
# the only problem is that I don't save the model with the max accuracy, but
# the model at the last epoch
actual_accuracy = cnn_hf.evaluateModel(distilled_quantized_model, test_loader)
print('Model "{}" => reported accuracy: {} - actual accuracy: {}'.format(
    distilled_quantized_model_name, reported_accuracy, actual_accuracy))
# Register the quantized distilled model with the manager.
imagenet_manager.add_new_model(
    quant_distilled_model_name,
    quantDistilledModelPath,
    arguments_creator_function=quantDistilledOptions)

# Quantized distillation training, with the unquantized AlexNet as teacher.
if TRAIN_QUANTIZED_DISTILLED:
    imagenet_manager.train_model(
        quant_distilled_model,
        model_name=quant_distilled_model_name,
        train_function=convForwModel.train_model,
        arguments_train_function={
            'epochs_to_train': epochsToTrainImageNet,
            'learning_rate_style': 'imagenet',
            'initial_learning_rate': 0.1,
            'use_nesterov': True,
            'initial_momentum': 0.9,
            'weight_decayL2': 1e-4,
            'start_epoch': 0,
            'print_every': 30,
            'use_distillation_loss': True,
            'teacher_model': alexnet_unquantized,
            'quantizeWeights': True,
            'numBits': NUM_BITS,
            'bucket_size': 256,
            'quantize_first_and_last_layer': False,
        },
        train_loader=train_loader,
        test_loader=test_loader)

quant_distilled_model.load_state_dict(
    imagenet_manager.load_model_state_dict(quant_distilled_model_name))

# Full evaluation of the student, with the default k and with k=5 ...
print(cnn_hf.evaluateModel(
    quant_distilled_model, test_loader, fastEvaluation=False))
print(cnn_hf.evaluateModel(
    quant_distilled_model, test_loader, fastEvaluation=False, k=5))
# ... and of the unquantized teacher for comparison.
print(cnn_hf.evaluateModel(
    alexnet_unquantized, test_loader, fastEvaluation=False))
print(cnn_hf.evaluateModel(
    alexnet_unquantized, test_loader, fastEvaluation=False, k=5))

# Size reported by mhf.get_size_quantized_model for 4 bits and bucket size
# 256, first and last layer excluded.
quant_fun = functools.partial(
    quantization.uniformQuantization, s=2**4, bucket_size=256)
size_mb = mhf.get_size_quantized_model(
    quant_distilled_model, 4, quant_fun, 256, quantizeFirstLastLayer=False)
print(size_mb)
train_function=convForwModel.train_model, arguments_train_function={'epochs_to_train': epochsToTrainCIFAR}, train_loader=train_loader, test_loader=test_loader) # else: # cifar10Manager.train_model(teacherModel, model_name=model_name, # train_function=convForwModel.train_model, # continue_training_from =1, # arguments_train_function={'epochs_to_train': epochsToTrainCIFAR}, # train_loader=train_loader, test_loader=test_loader) print("Teacher Model Training complete") print("Eval Teacher model") print(model_name) teacherModel.load_state_dict(cifar10Manager.load_model_state_dict(model_name)) acc = cnn_hf.evaluateModel(teacherModel, test_loader, k=1) print("Top-1 eval acc is {}".format(acc)) smallerModelSpec2 = {'spec_conv_layers': [(25, 5, 5), (10, 5, 5), (10, 5, 5), (5, 5, 5)], 'spec_max_pooling': [(1, 2, 2), (3, 2, 2)], 'spec_dropout_rates': [(1, 0.2), (3, 0.3), (4, 0.4)], 'spec_linear': [300], 'width': 32, 'height': 32} small_model_name = 'cifar10_smaller_spec2' model_small_spec = copy.deepcopy(smallerModelSpec2) smallerModelPath = os.path.join(model_save_path, small_model_name) smallerModel = convForwModel.ConvolForwardNet(**model_small_spec, useBatchNorm=True, useAffineTransformInBatchNorm=True) if not small_model_name in cifar10Manager.saved_models:
**smallerModelSpecs[args.stModel], activation=args.stud_act, numBins=args.num_bins, useBatchNorm=USE_BATCH_NORM, useAffineTransformInBatchNorm=AFFINE_BATCH_NORM) else: model = convForwModel.ConvolForwardNet( **convForwModel.teacherModelSpec, useBatchNorm=USE_BATCH_NORM, useAffineTransformInBatchNorm=AFFINE_BATCH_NORM) if USE_CUDA: model = model.cuda() test_loader = data.getTestLoader(1) import time start = time.time() cnn_hf.evaluateModel(model, test_loader) mem = torch.cuda.max_memory_allocated() end = time.time() avg_time = (end - start) * 1000 / len(test_loader) if args.train_teacher: str2save = 'teacher_cifar10: time: {} ms, memory: {} M'.format( avg_time, mem / (1024**2)) print(str2save) else: str2save = 's_{}_{}_nb_{}cifar10: time: {} ms, memory: {} M'.format( args.stModel, args.stud_act, args.num_bins, avg_time, mem / (1024**2)) print(str2save) with open('memory.txt', 'a') as fr: fr.write(str2save + '\n') torch.cuda.empty_cache()