def train_model(model, train_loader, test_loader, initial_learning_rate=0.001, use_nesterov=True,
                initial_momentum=0.9, weight_decayL2=0.00022, epochs_to_train=100, print_every=500,
                learning_rate_style='generic', use_distillation_loss=False, teacher_model=None,
                quantizeWeights=False, numBits=8, grad_clipping_threshold=False, start_epoch=0,
                bucket_size=None, quantizationFunctionToUse='uniformLinearScaling',
                backprop_quantization_style='none', estimate_quant_grad_every=1,
                add_gradient_noise=False, ask_teacher_strategy=('always', None),
                quantize_first_and_last_layer=True, mix_with_differentiable_quantization=False):

    # backprop_quantization_style determines how to modify the gradients to take into account the
    # quantization function. Specifically, one can use 'none', where gradients are not modified,
    # 'truncated', where gradient values outside -1 and 1 are truncated to 0 (as per the paper
    # linked in the comments below), and 'complicated', the temporary name for my own approach,
    # which is slow and complicated to compute.

    if use_distillation_loss is True and teacher_model is None:
        raise ValueError('To compute distillation loss you have to pass the teacher model')

    if teacher_model is not None:
        teacher_model.eval()

    learning_rate_style = learning_rate_style.lower()
    lr_scheduler = cnn_hf.LearningRateScheduler(initial_learning_rate, learning_rate_style)
    new_learning_rate = initial_learning_rate
    optimizer = optim.SGD(model.parameters(), lr=initial_learning_rate, nesterov=use_nesterov,
                          momentum=initial_momentum, weight_decay=weight_decayL2)
    startTime = time.time()

    pred_accuracy_epochs = []
    percentages_asked_teacher = []
    losses_epochs = []
    informationDict = {}
    last_loss_saved = float('inf')
    step_since_last_grad_quant_estimation = 1
    number_minibatches_per_epoch = len(train_loader)

    if quantizeWeights:
        quantizationFunctionToUse = quantizationFunctionToUse.lower()
        if backprop_quantization_style is None:
            backprop_quantization_style = 'none'
        backprop_quantization_style = backprop_quantization_style.lower()

        if quantizationFunctionToUse == 'uniformAbsMaxScaling'.lower():
            s = 2 ** (numBits - 1)
            type_of_scaling = 'absmax'
        elif quantizationFunctionToUse == 'uniformLinearScaling'.lower():
            s = 2 ** numBits
            type_of_scaling = 'linear'
        else:
            raise ValueError('The specified quantization function is not present')

        if backprop_quantization_style is None or backprop_quantization_style in ('none', 'truncated'):
            quantizeFunctions = lambda x: quantization.uniformQuantization(
                x, s, type_of_scaling=type_of_scaling, stochastic_rounding=False,
                max_element=False, subtract_mean=False, modify_in_place=False,
                bucket_size=bucket_size)[0]
        elif backprop_quantization_style == 'complicated':
            quantizeFunctions = [quantization.uniformQuantization_variable(
                s, type_of_scaling=type_of_scaling, stochastic_rounding=False,
                max_element=False, subtract_mean=False, modify_in_place=False,
                bucket_size=bucket_size) for _ in model.parameters()]
        else:
            raise ValueError('The specified backprop_quantization_style not recognized')

        num_parameters = sum(1 for _ in model.parameters())

        def quantize_weights_model(model):
            for idx, p in enumerate(model.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == num_parameters - 1:
                        continue  # don't quantize first and last layer
                if backprop_quantization_style == 'truncated':
                    p.data.clamp_(-1, 1)
                if backprop_quantization_style in ('none', 'truncated'):
                    p.data = quantizeFunctions(p.data)
                elif backprop_quantization_style == 'complicated':
                    p.data = quantizeFunctions[idx].forward(p.data)
                else:
                    raise ValueError

        def backward_quant_weights_model(model):
            if backprop_quantization_style == 'none':
                return
            for idx, p in enumerate(model.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == num_parameters - 1:
                        continue  # don't quantize first and last layer
                # Now some sort of backward. For the 'none' style, we don't do anything.
                # For the 'truncated' style, we just need to truncate the grad weights,
                # as per the paper here: https://arxiv.org/pdf/1609.07061.pdf
                # i.e. gradient values for weights with absolute value above 1 are set to 0.
                # Their case is not immediately applicable to ours, but let's try this out.
                if backprop_quantization_style == 'truncated':
                    p.grad.data[p.data.abs() > 1] = 0
                elif backprop_quantization_style == 'complicated':
                    p.grad.data = quantizeFunctions[idx].backward(p.grad.data)

    if print_every > number_minibatches_per_epoch:
        print_every = number_minibatches_per_epoch // 2

    try:
        epoch = start_epoch
        for epoch in range(start_epoch, epochs_to_train + start_epoch):
            print("begin training")
            if USE_CUDA:
                print("USE_CUDA")
            if mix_with_differentiable_quantization:
                print('=== Starting Quantized Distillation epoch ===')
            model.train()
            print_loss_total = 0
            count_asked_teacher = 0
            count_asked_total = 0
            for idx_minibatch, data in enumerate(train_loader, start=1):

                if quantizeWeights:
                    if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                        # we save them because we only want to quantize weights to compute gradients,
                        # but keep using non-quantized weights during the algorithm
                        model_state_dict = model.state_dict()
                        quantize_weights_model(model)

                model.zero_grad()
                print_loss, curr_c_teach, curr_c_total = cnn_hf.forward_and_backward(
                    model, data, idx_minibatch, epoch,
                    use_distillation_loss=use_distillation_loss,
                    teacher_model=teacher_model,
                    ask_teacher_strategy=ask_teacher_strategy,
                    return_more_info=True)
                count_asked_teacher += curr_c_teach
                count_asked_total += curr_c_total

                # load the non-quantized weights and use them for the update. The quantized
                # weights are used only to get the quantized gradient
                if quantizeWeights:
                    if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                        model.load_state_dict(model_state_dict)
                        del model_state_dict  # free memory

                if add_gradient_noise and not quantizeWeights:
                    cnn_hf.add_gradient_noise(model, idx_minibatch, epoch, number_minibatches_per_epoch)

                if grad_clipping_threshold is not False:
                    # gradient clipping
                    for p in model.parameters():
                        p.grad.data.clamp_(-grad_clipping_threshold, grad_clipping_threshold)

                if quantizeWeights:
                    if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                        backward_quant_weights_model(model)

                optimizer.step()

                if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                    step_since_last_grad_quant_estimation = 0
                step_since_last_grad_quant_estimation += 1

                # print statistics
                print_loss_total += print_loss
                if idx_minibatch % print_every == 0:
                    last_loss_saved = print_loss_total / print_every
                    str_to_print = 'Time Elapsed: {}, [Start Epoch: {}, Epoch: {}, Minibatch: {}], loss: {:3f}'.format(
                        mhf.timeSince(startTime), start_epoch + 1, epoch + 1, idx_minibatch, last_loss_saved)
                    if pred_accuracy_epochs:
                        str_to_print += ' Last prediction accuracy: {:2f}%'.format(pred_accuracy_epochs[-1] * 100)
                    print(str_to_print)
                    print_loss_total = 0

            curr_percentages_asked_teacher = count_asked_teacher / count_asked_total if count_asked_total != 0 else 0
            percentages_asked_teacher.append(curr_percentages_asked_teacher)
            losses_epochs.append(last_loss_saved)
            curr_pred_accuracy = cnn_hf.evaluateModel(model, test_loader, fastEvaluation=False)
            pred_accuracy_epochs.append(curr_pred_accuracy)
            print(' === Epoch: {} - prediction accuracy {:2f}% === '.format(epoch + 1, curr_pred_accuracy * 100))

            if mix_with_differentiable_quantization and epoch != start_epoch + epochs_to_train - 1:
                print('=== Starting Differentiable Quantization epoch ===')
                # the diff quant step is not done at the last epoch, so we end on a quantized distillation epoch
                model_state_dict = optimize_quantization_points(
                    model, train_loader, test_loader, new_learning_rate,
                    initial_momentum=initial_momentum, epochs_to_train=1, print_every=print_every,
                    use_nesterov=use_nesterov, learning_rate_style=learning_rate_style,
                    numPointsPerTensor=2 ** numBits, assignBitsAutomatically=True,
                    bucket_size=bucket_size, use_distillation_loss=True,
                    initialize_method='quantiles',
                    quantize_first_and_last_layer=quantize_first_and_last_layer)[0]
                model.load_state_dict(model_state_dict)
                del model_state_dict  # free memory
                losses_epochs.append(last_loss_saved)
                curr_pred_accuracy = cnn_hf.evaluateModel(model, test_loader, fastEvaluation=False)
                pred_accuracy_epochs.append(curr_pred_accuracy)
                print(' === Epoch: {} - prediction accuracy {:2f}% === '.format(epoch + 1, curr_pred_accuracy * 100))

            # update the learning rate
            new_learning_rate, stop_training = lr_scheduler.update_learning_rate(epoch, 1 - curr_pred_accuracy)
            if stop_training is True:
                break
            for p in optimizer.param_groups:
                try:
                    p['lr'] = new_learning_rate
                except:
                    pass

    except Exception as e:
        print('An exception occurred: {}\n. Training has been stopped after {} epochs.'.format(e, epoch))
        informationDict['errorFlag'] = True
        informationDict['numEpochsTrained'] = epoch - start_epoch
        return model, informationDict
    except KeyboardInterrupt:
        print('User stopped training after {} epochs'.format(epoch))
        informationDict['errorFlag'] = False
        informationDict['numEpochsTrained'] = epoch - start_epoch
    else:
        print('Finished Training in {} epochs'.format(epoch + 1))
        informationDict['errorFlag'] = False
        informationDict['numEpochsTrained'] = epoch + 1 - start_epoch

    if quantizeWeights:
        quantize_weights_model(model)

    if mix_with_differentiable_quantization:
        informationDict['numEpochsTrained'] *= 2

    informationDict['percentages_asked_teacher'] = percentages_asked_teacher
    informationDict['predictionAccuracy'] = pred_accuracy_epochs
    informationDict['lossSaved'] = losses_epochs
    return model, informationDict
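
# The training loop above computes gradients on quantized weights but applies the update to the
# full-precision copy, zeroing gradients where |w| > 1 in the 'truncated' style
# (https://arxiv.org/pdf/1609.07061.pdf). Below is a minimal standalone sketch of that single
# step using plain PyTorch; `toy_uniform_quantize` and `toy_quantized_grad_step` are hypothetical
# stand-ins for the repository's quantization module, not its actual implementation.

import copy
import torch
import torch.nn as nn
import torch.optim as optim


def toy_uniform_quantize(t, num_bits=8):
    # Linear (min-max) uniform quantization to 2**num_bits levels, whole tensor as a single bucket.
    s = 2 ** num_bits - 1
    lo, hi = t.min(), t.max()
    scale = (hi - lo).clamp(min=1e-8)
    return torch.round((t - lo) / scale * s) / s * scale + lo


def toy_quantized_grad_step(model, loss_fn, inputs, targets, optimizer, num_bits=8):
    full_precision = copy.deepcopy(model.state_dict())   # keep FP weights for the update
    with torch.no_grad():
        for p in model.parameters():
            p.data.clamp_(-1, 1)                          # 'truncated' style also clamps the weights
            p.data = toy_uniform_quantize(p.data, num_bits)
    optimizer.zero_grad()
    loss_fn(model(inputs), targets).backward()            # gradient w.r.t. the quantized weights
    model.load_state_dict(full_precision)                 # restore the FP weights
    for p in model.parameters():
        p.grad.data[p.data.abs() > 1] = 0                  # truncate gradients outside [-1, 1]
    optimizer.step()                                       # FP update driven by the quantized gradient


# usage on a toy model:
net = nn.Linear(4, 2)
opt = optim.SGD(net.parameters(), lr=0.1, momentum=0.9)
toy_quantized_grad_step(net, nn.CrossEntropyLoss(),
                        torch.randn(8, 4), torch.randint(0, 2, (8,)), opt, num_bits=4)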
actual_bit_huffmman = qhf.get_huffman_encoding_mean_bit_length(model.parameters(), quant_fun,
                                                               'uniform', s=2**curr_num_bit)
print('Effective bit Huffman: {} - Size reduction: {}'.format(
    actual_bit_huffmman, mhf.get_size_reduction(actual_bit_huffmman, bucket_size=256)))

if CHECK_PM_QUANTIZATION:
    QUANTIZE_FIRST_LAST_LAYER = False
    if 'distilled' in x:
        for numBit in numBits:
            for bucket_size in (None, 256):
                model.load_state_dict(cifar10Manager.load_model_state_dict(x))
                numParam = sum(1 for _ in model.parameters())
                for idx, p in enumerate(model.parameters()):
                    if QUANTIZE_FIRST_LAST_LAYER is False:
                        if idx == 0 or idx == numParam - 1:
                            continue
                    p.data = quantization.uniformQuantization(p.data, s=2**numBit,
                                                              type_of_scaling='linear',
                                                              bucket_size=bucket_size)[0]
                predAcc = cnn_hf.evaluateModel(model, test_loader, fastEvaluation=False)
                print('PM quantization of model "{}" with "{}" bits and {} buckets: {:2f}%'.format(
                    x, numBit, bucket_size, predAcc * 100))
                quant_fun = functools.partial(quantization.uniformQuantization,
                                              s=2**numBit, bucket_size=bucket_size)
                actual_bit_huffmman = qhf.get_huffman_encoding_mean_bit_length(
                    model.parameters(), quant_fun, 'uniform', s=2**numBit)
                size_mb = mhf.get_size_quantized_model(model, numBit, quant_fun, bucket_size,
                                                       quantizeFirstLastLayer=QUANTIZE_FIRST_LAST_LAYER)
                print('Effective bit Huffman: {} - Size reduction: {} - Size MB: {}'.format(
                    actual_bit_huffmman,
                    mhf.get_size_reduction(actual_bit_huffmman, bucket_size=bucket_size),
                    size_mb))

distilled_model_names = ['cifar10_distilled_spec{}'.format(idx_spec)
                         for idx_spec in range(len(smallerModelSpecs))]
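
# qhf.get_huffman_encoding_mean_bit_length reports the average number of bits per weight once the
# quantized values are Huffman-coded. The sketch below illustrates that quantity with the standard
# heapq-based Huffman construction over a histogram of symbols; it is a standalone illustration,
# not the repository's implementation.

import heapq
from collections import Counter


def mean_huffman_bit_length(symbols):
    """Average code length (bits/symbol) of an optimal Huffman code for the given symbols."""
    counts = Counter(symbols)
    if len(counts) <= 1:
        return 1.0  # degenerate case: a single distinct symbol still needs one bit
    heap = list(counts.values())
    heapq.heapify(heap)
    total_bits = 0
    while len(heap) > 1:
        a, b = heapq.heappop(heap), heapq.heappop(heap)
        total_bits += a + b          # each merge adds one bit to every symbol below it
        heapq.heappush(heap, a + b)
    return total_bits / sum(counts.values())


# e.g. 2-bit quantization indices with a skewed distribution:
indices = [0] * 70 + [1] * 20 + [2] * 8 + [3] * 2
print(mean_huffman_bit_length(indices))   # 1.4 bits, below the 2 bits of fixed-length coding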
def train_model(model, train_loader, test_loader, plot_path, optim=None, options=None,
                stochasticRounding=False, quantizeWeights=False, numBits=8,
                maxElementAllowedForQuantization=False, bucket_size=None,
                subtractMeanInQuantization=False, quantizationFunctionToUse='uniformLinearScaling',
                backprop_quantization_style='none', num_estimate_quant_grad=1,
                use_distillation_loss=False, teacher_model=None, quantize_first_and_last_layer=True):

    if options is None:
        options = copy.deepcopy(onmt.standard_options.stdOptions)
    if not isinstance(options, dict):
        options = mhf.convertToDictionary(options)
    options = handle_options(options)
    options = mhf.convertToNamedTuple(options)

    if optim is None:
        optim = create_optimizer(model, options)

    if use_distillation_loss is True and teacher_model is None:
        raise ValueError('To train with word-level distillation you have to pass the teacher model')

    if teacher_model is not None:
        teacher_model.eval()

    step_since_last_grad_quant_estimation = 0
    num_param_model = sum(1 for _ in model.parameters())

    if quantizeWeights:
        quantizationFunctionToUse = quantizationFunctionToUse.lower()
        if quantizationFunctionToUse == 'uniformAbsMaxScaling'.lower():
            s = 2 ** (numBits - 1)
            type_of_scaling = 'absmax'
        elif quantizationFunctionToUse == 'uniformLinearScaling'.lower():
            s = 2 ** numBits
            type_of_scaling = 'linear'
        else:
            raise ValueError('The specified quantization function is not present')

        if backprop_quantization_style is None or backprop_quantization_style in ('none', 'truncated'):
            quantizeFunctions = lambda x: quantization.uniformQuantization(
                x, s, type_of_scaling=type_of_scaling,
                stochastic_rounding=stochasticRounding,
                max_element=maxElementAllowedForQuantization,
                subtract_mean=subtractMeanInQuantization,
                modify_in_place=False, bucket_size=bucket_size)[0]
        elif backprop_quantization_style == 'complicated':
            quantizeFunctions = [quantization.uniformQuantization_variable(
                s, type_of_scaling=type_of_scaling,
                stochastic_rounding=stochasticRounding,
                max_element=maxElementAllowedForQuantization,
                subtract_mean=subtractMeanInQuantization,
                modify_in_place=False, bucket_size=bucket_size)
                for _ in model.parameters()]
        else:
            raise ValueError('The specified backprop_quantization_style not recognized')

    fields = train_loader.dataset.fields
    # Collect features.
    src_features = collect_features(train_loader.dataset, fields)
    for j, feat in enumerate(src_features):
        print(' * src feature %d size = %d' % (j, len(fields[feat].vocab)))

    train_loss = make_loss_compute(model, fields["tgt"].vocab, train_loader.dataset,
                                   options.copy_attn, options.copy_attn_force,
                                   use_distillation_loss, teacher_model)
    # for validation we don't use the distillation loss; it would skew the perplexity computation
    valid_loss = make_loss_compute(model, fields["tgt"].vocab, test_loader.dataset,
                                   options.copy_attn, options.copy_attn_force)

    trunc_size = None  # options.truncated_decoder  # Badly named...
    shard_size = options.max_generator_batches

    trn_writer = tbx.SummaryWriter(plot_path + '_output/train')
    tst_writer = tbx.SummaryWriter(plot_path + '_output/test')

    trainer = thf.MyTrainer(model, train_loader, test_loader, train_loss, valid_loss,
                            optim, trunc_size, shard_size)
    perplexity_epochs = []

    for epoch in range(options.start_epoch, options.epochs + 1):
        MAX_Memory = 0
        train_stats = onmt.Statistics()
        model.train()
        for idx_batch, batch in enumerate(train_loader):
            model.zero_grad()

            if quantizeWeights:
                if step_since_last_grad_quant_estimation >= num_estimate_quant_grad:
                    # we save them because we only want to quantize weights to compute gradients,
                    # but keep using non-quantized weights during the algorithm
                    model_state_dict = model.state_dict()
                    for idx, p in enumerate(model.parameters()):
                        if quantize_first_and_last_layer is False:
                            if idx == 0 or idx == num_param_model - 1:
                                continue
                        if backprop_quantization_style == 'truncated':
                            p.data.clamp_(-1, 1)  # TODO: Is this necessary? Clamping the weights?
                        if backprop_quantization_style in ('none', 'truncated'):
                            p.data = quantizeFunctions(p.data)
                        elif backprop_quantization_style == 'complicated':
                            p.data = quantizeFunctions[idx].forward(p.data)
                        else:
                            raise ValueError

            trainer.forward_and_backward(idx_batch, batch, epoch, train_stats, report_func,
                                         use_distillation_loss, teacher_model)

            if quantizeWeights:
                if step_since_last_grad_quant_estimation >= num_estimate_quant_grad:
                    model.load_state_dict(model_state_dict)
                    del model_state_dict  # free memory
                    if backprop_quantization_style in ('truncated', 'complicated'):
                        for idx, p in enumerate(model.parameters()):
                            if quantize_first_and_last_layer is False:
                                if idx == 0 or idx == num_param_model - 1:
                                    continue
                            # Now some sort of backward. For the 'none' style, we don't do anything.
                            # For the 'truncated' style, we just need to truncate the grad weights,
                            # as per the paper here: https://arxiv.org/pdf/1609.07061.pdf
                            # 'complicated' is my own derivation, but I'm unsure whether to use it or not
                            if backprop_quantization_style == 'truncated':
                                p.grad.data[p.data.abs() > 1] = 0
                            elif backprop_quantization_style == 'complicated':
                                p.grad.data = quantizeFunctions[idx].backward(p.grad.data)

            # update parameters after every batch
            trainer.optim.step()

            if step_since_last_grad_quant_estimation >= num_estimate_quant_grad:
                step_since_last_grad_quant_estimation = 0
            step_since_last_grad_quant_estimation += 1

        print('Train perplexity: %g' % train_stats.ppl())
        print('Train accuracy: %g' % train_stats.accuracy())
        trn_writer.add_scalar('ppl', train_stats.ppl(), epoch + 1)
        trn_writer.add_scalar('acc', train_stats.accuracy(), epoch + 1)

        # 2. Validate on the validation set.
        MAX_Memory = max(MAX_Memory, torch.cuda.max_memory_allocated())
        valid_stats = trainer.validate()
        print('Validation perplexity: %g' % valid_stats.ppl())
        print('Validation accuracy: %g' % valid_stats.accuracy())
        print('Max allocated memory: {:2f}MB'.format(MAX_Memory / (1024 ** 2)))
        perplexity_epochs.append(valid_stats.ppl())
        tst_writer.add_scalar('ppl', valid_stats.ppl(), epoch + 1)
        tst_writer.add_scalar('acc', valid_stats.accuracy(), epoch + 1)

        # 3. Update the learning rate
        trainer.epoch_step(valid_stats.ppl(), epoch)

    if quantizeWeights:
        for idx, p in enumerate(model.parameters()):
            if backprop_quantization_style == 'truncated':
                p.data.clamp_(-1, 1)  # TODO: Is this necessary? Clamping the weights?
            if backprop_quantization_style in ('none', 'truncated'):
                p.data = quantizeFunctions(p.data)
            elif backprop_quantization_style == 'complicated':
                p.data = quantizeFunctions[idx].forward(p.data)
                del quantizeFunctions[idx].saved_for_backward
                quantizeFunctions[idx].saved_for_backward = None  # free memory
            else:
                raise ValueError

    informationDict = {}
    informationDict['perplexity'] = perplexity_epochs
    informationDict['numEpochsTrained'] = options.epochs + 1 - options.start_epoch
    return model, informationDict
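
# A hedged usage sketch for the translation train_model above. The keyword arguments are taken
# from its signature, but `small_model`, `teacher_model`, `train_loader`, `test_loader` and the
# plot path are placeholders for objects produced by the repository's own data and model helpers
# (their creation is not shown here).

quantized_model, info = train_model(
    small_model, train_loader, test_loader,
    plot_path='plots/onmt_quantized_4bit',       # hypothetical output prefix for the tensorboardX writers
    quantizeWeights=True,
    numBits=4,
    bucket_size=256,
    quantizationFunctionToUse='uniformLinearScaling',
    backprop_quantization_style='truncated',      # zero gradients where |w| > 1, as in the loop above
    use_distillation_loss=True,
    teacher_model=teacher_model)
print('Final validation perplexity: {}'.format(info['perplexity'][-1]))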
    currOptions = distilledOptions
elif 'teacher' in x:
    currOptions = teacherOptions
elif 'word_level' in x:
    currOptions = distilledOptions
else:
    currOptions = smallerOptions

currOptions['batch_size'] = 1  # important for the BLEU computation.
model = tmm.create_model(dataset.fields, options=currOptions)
if USE_CUDA:
    model = model.cuda()
model.load_state_dict(onmtManager.load_model_state_dict(x))

if to_quantize:
    for p in model.parameters():
        p.data = quantization.uniformQuantization(p.data, 2**4, bucket_size=256)[0]

num_examples = 5
print('Example of translation for "{}"'.format(x))
linesToTranslate, translated_lines, referenceLines = transl_hf.get_translation_examples(
    model, dataset, num_examples, currOptions, standardTranslateOptions)
print('Original Sentences == Translation == Ref Translation')
print('\n'.join(' == '.join(x) for x in zip(linesToTranslate, translated_lines, referenceLines)))
bleu = transl_hf.get_bleu_model(model, dataset, currOptions, standardTranslateOptions)
print('Model "{}" ==> Perplexity: {}, BLEU: {}'.format(x, perplexity, bleu))
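
# Back-of-the-envelope arithmetic for the 4-bit, bucket-size-256 setting used above. The exact
# bookkeeping lives in mhf.get_size_reduction / mhf.get_size_quantized_model; the sketch below
# only assumes b bits per weight plus two full-precision (32-bit) scaling values per bucket,
# which is what linear (min-max) scaling needs, and is an illustration rather than the repo's formula.

def approx_size_reduction(bits_per_weight, bucket_size, fp_bits=32, scalars_per_bucket=2):
    # effective bits per weight = payload bits + amortized per-bucket scaling overhead
    quantized_bits = bits_per_weight + scalars_per_bucket * fp_bits / bucket_size
    return fp_bits / quantized_bits

print(approx_size_reduction(4, 256))   # roughly 7.5x smaller than 32-bit floats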