Example no. 1
def train_model(model,
                train_loader,
                test_loader,
                initial_learning_rate=0.001,
                use_nesterov=True,
                initial_momentum=0.9,
                weight_decayL2=0.00022,
                epochs_to_train=100,
                print_every=500,
                learning_rate_style='generic',
                use_distillation_loss=False,
                teacher_model=None,
                quantizeWeights=False,
                numBits=8,
                grad_clipping_threshold=False,
                start_epoch=0,
                bucket_size=None,
                quantizationFunctionToUse='uniformLinearScaling',
                backprop_quantization_style='none',
                estimate_quant_grad_every=1,
                add_gradient_noise=False,
                ask_teacher_strategy=('always', None),
                quantize_first_and_last_layer=True,
                mix_with_differentiable_quantization=False):

    # backprop_quantization_style determines how the gradients are modified to account for the
    # quantization function: with 'none' the gradients are left untouched; with 'truncated'
    # the gradient is zeroed wherever the corresponding weight falls outside [-1, 1]
    # (following https://arxiv.org/pdf/1609.07061.pdf); 'complicated' is the author's own,
    # more expensive rule, implemented via the backward() of quantization.uniformQuantization_variable.
    # A minimal sketch of the 'truncated' variant is given after this function.

    if use_distillation_loss is True and teacher_model is None:
        raise ValueError(
            'To compute distillation loss you have to pass the teacher model')

    if teacher_model is not None:
        teacher_model.eval()

    learning_rate_style = learning_rate_style.lower()
    lr_scheduler = cnn_hf.LearningRateScheduler(initial_learning_rate,
                                                learning_rate_style)
    new_learning_rate = initial_learning_rate
    optimizer = optim.SGD(model.parameters(),
                          lr=initial_learning_rate,
                          nesterov=use_nesterov,
                          momentum=initial_momentum,
                          weight_decay=weight_decayL2)
    startTime = time.time()

    pred_accuracy_epochs = []
    percentages_asked_teacher = []
    losses_epochs = []
    informationDict = {}
    last_loss_saved = float('inf')
    step_since_last_grad_quant_estimation = 1
    number_minibatches_per_epoch = len(train_loader)

    if quantizeWeights:
        quantizationFunctionToUse = quantizationFunctionToUse.lower()
        if backprop_quantization_style is None:
            backprop_quantization_style = 'none'
        backprop_quantization_style = backprop_quantization_style.lower()
        if quantizationFunctionToUse == 'uniformAbsMaxScaling'.lower():
            s = 2**(numBits - 1)
            type_of_scaling = 'absmax'
        elif quantizationFunctionToUse == 'uniformLinearScaling'.lower():
            s = 2**numBits
            type_of_scaling = 'linear'
        else:
            raise ValueError(
                'The specified quantization function is not present')

        if backprop_quantization_style is None or backprop_quantization_style in (
                'none', 'truncated'):
            quantizeFunctions = lambda x: quantization.uniformQuantization(
                x,
                s,
                type_of_scaling=type_of_scaling,
                stochastic_rounding=False,
                max_element=False,
                subtract_mean=False,
                modify_in_place=False,
                bucket_size=bucket_size)[0]

        elif backprop_quantization_style == 'complicated':
            quantizeFunctions = [
                quantization.uniformQuantization_variable(
                    s,
                    type_of_scaling=type_of_scaling,
                    stochastic_rounding=False,
                    max_element=False,
                    subtract_mean=False,
                    modify_in_place=False,
                    bucket_size=bucket_size) for _ in model.parameters()
            ]
        else:
            raise ValueError(
                'The specified backprop_quantization_style not recognized')

        num_parameters = sum(1 for _ in model.parameters())

        def quantize_weights_model(model):
            for idx, p in enumerate(model.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == num_parameters - 1:
                        continue  #don't quantize first and last layer
                if backprop_quantization_style == 'truncated':
                    p.data.clamp_(-1, 1)
                if backprop_quantization_style in ('none', 'truncated'):
                    p.data = quantizeFunctions(p.data)
                elif backprop_quantization_style == 'complicated':
                    p.data = quantizeFunctions[idx].forward(p.data)
                else:
                    raise ValueError

        def backward_quant_weights_model(model):
            if backprop_quantization_style == 'none':
                return

            for idx, p in enumerate(model.parameters()):
                if quantize_first_and_last_layer is False:
                    if idx == 0 or idx == num_parameters - 1:
                        continue  #don't quantize first and last layer

                # Backward correction for the quantization. For the 'none' style nothing is done.
                # For the 'truncated' style the gradient is zeroed wherever the weight's absolute
                # value exceeds 1, as per the paper here: https://arxiv.org/pdf/1609.07061.pdf
                # (their setting is not identical to ours, but the same rule is applied here).
                if backprop_quantization_style == 'truncated':
                    p.grad.data[p.data.abs() > 1] = 0
                elif backprop_quantization_style == 'complicated':
                    p.grad.data = quantizeFunctions[idx].backward(p.grad.data)

    if print_every > number_minibatches_per_epoch:
        print_every = number_minibatches_per_epoch // 2

    try:
        epoch = start_epoch
        for epoch in range(start_epoch, epochs_to_train + start_epoch):
            print("begin training")
            if USE_CUDA:
                print("USE_CUDA")
            if mix_with_differentiable_quantization:
                print('=== Starting Quantized Distillation epoch === ')
            model.train()
            print_loss_total = 0
            count_asked_teacher = 0
            count_asked_total = 0
            for idx_minibatch, data in enumerate(train_loader, start=1):

                if quantizeWeights:
                    if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                        # save the full-precision weights: the quantized weights are used only
                        # for the forward/backward pass, while the optimizer keeps updating the
                        # full-precision copy
                        model_state_dict = model.state_dict()
                        quantize_weights_model(model)

                model.zero_grad()
                print_loss, curr_c_teach, curr_c_total = cnn_hf.forward_and_backward(
                    model,
                    data,
                    idx_minibatch,
                    epoch,
                    use_distillation_loss=use_distillation_loss,
                    teacher_model=teacher_model,
                    ask_teacher_strategy=ask_teacher_strategy,
                    return_more_info=True)
                count_asked_teacher += curr_c_teach
                count_asked_total += curr_c_total

                # reload the non-quantized weights and use them for the update; the quantized
                # weights were needed only to compute the quantized gradient
                if quantizeWeights:
                    if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                        model.load_state_dict(model_state_dict)
                        del model_state_dict  #free memory

                if add_gradient_noise and not quantizeWeights:
                    cnn_hf.add_gradient_noise(model, idx_minibatch, epoch,
                                              number_minibatches_per_epoch)

                if grad_clipping_threshold is not False:
                    # gradient clipping
                    for p in model.parameters():
                        p.grad.data.clamp_(-grad_clipping_threshold,
                                           grad_clipping_threshold)

                if quantizeWeights:
                    if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                        backward_quant_weights_model(model)

                optimizer.step()

                if step_since_last_grad_quant_estimation >= estimate_quant_grad_every:
                    step_since_last_grad_quant_estimation = 0

                step_since_last_grad_quant_estimation += 1

                # print statistics
                print_loss_total += print_loss
                if (idx_minibatch) % print_every == 0:
                    last_loss_saved = print_loss_total / print_every
                    str_to_print = 'Time Elapsed: {}, [Start Epoch: {}, Epoch: {}, Minibatch: {}], loss: {:.3f}'.format(
                        mhf.timeSince(startTime), start_epoch + 1, epoch + 1,
                        idx_minibatch, last_loss_saved)
                    if pred_accuracy_epochs:
                        str_to_print += ' Last prediction accuracy: {:.2f}%'.format(
                            pred_accuracy_epochs[-1] * 100)
                    print(str_to_print)
                    print_loss_total = 0

            curr_percentages_asked_teacher = count_asked_teacher / count_asked_total if count_asked_total != 0 else 0
            percentages_asked_teacher.append(curr_percentages_asked_teacher)
            losses_epochs.append(last_loss_saved)
            curr_pred_accuracy = cnn_hf.evaluateModel(model,
                                                      test_loader,
                                                      fastEvaluation=False)
            pred_accuracy_epochs.append(curr_pred_accuracy)
            print(' === Epoch: {} - prediction accuracy {:.2f}% === '.format(
                epoch + 1, curr_pred_accuracy * 100))

            if mix_with_differentiable_quantization and epoch != start_epoch + epochs_to_train - 1:
                print('=== Starting Differentiable Quantization epoch === ')
                #the diff quant step is not done at the last epoch, so we end on a quantized distillation epoch
                model_state_dict = optimize_quantization_points(
                    model,
                    train_loader,
                    test_loader,
                    new_learning_rate,
                    initial_momentum=initial_momentum,
                    epochs_to_train=1,
                    print_every=print_every,
                    use_nesterov=use_nesterov,
                    learning_rate_style=learning_rate_style,
                    numPointsPerTensor=2**numBits,
                    assignBitsAutomatically=True,
                    bucket_size=bucket_size,
                    use_distillation_loss=True,
                    initialize_method='quantiles',
                    quantize_first_and_last_layer=quantize_first_and_last_layer
                )[0]
                model.load_state_dict(model_state_dict)
                del model_state_dict  # free memory
                losses_epochs.append(last_loss_saved)
                curr_pred_accuracy = cnn_hf.evaluateModel(model,
                                                          test_loader,
                                                          fastEvaluation=False)
                pred_accuracy_epochs.append(curr_pred_accuracy)
                print(
                    ' === Epoch: {} - prediction accuracy {:.2f}% === '.format(
                        epoch + 1, curr_pred_accuracy * 100))

            #updating the learning rate
            new_learning_rate, stop_training = lr_scheduler.update_learning_rate(
                epoch, 1 - curr_pred_accuracy)
            if stop_training is True:
                break
            for p in optimizer.param_groups:
                try:
                    p['lr'] = new_learning_rate
                except:
                    pass

    except Exception as e:
        print(
            'An exception occurred: {}.\nTraining has been stopped after {} epochs.'
            .format(e, epoch))
        informationDict['errorFlag'] = True
        informationDict['numEpochsTrained'] = epoch - start_epoch

        return model, informationDict
    except KeyboardInterrupt:
        print('User stopped training after {} epochs'.format(epoch))
        informationDict['errorFlag'] = False
        informationDict['numEpochsTrained'] = epoch - start_epoch
    else:
        print('Finished Training in {} epochs'.format(epoch + 1))
        informationDict['errorFlag'] = False
        informationDict['numEpochsTrained'] = epoch + 1 - start_epoch

    if quantizeWeights:
        quantize_weights_model(model)

    if mix_with_differentiable_quantization:
        informationDict['numEpochsTrained'] *= 2

    informationDict['percentages_asked_teacher'] = percentages_asked_teacher
    informationDict['predictionAccuracy'] = pred_accuracy_epochs
    informationDict['lossSaved'] = losses_epochs
    return model, informationDict
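
A note on the quantize-for-gradient pattern used in train_model above: the full-precision weights are saved, the model is quantized for the forward/backward pass, the full-precision weights are restored before the optimizer step, and (in the 'truncated' style) the gradient is zeroed wherever a weight falls outside [-1, 1]. The code below is a minimal, self-contained sketch of a single such step, not the repository's implementation: toy_quantize, truncated_quantized_step, loss_fn, inputs and targets are placeholder names.

import copy
import torch

def toy_quantize(t, num_bits=8):
    # uniform linear quantization of t onto 2**num_bits levels between its min and max
    lo, hi = t.min(), t.max()
    scale = (hi - lo) / (2 ** num_bits - 1) + 1e-12
    return torch.round((t - lo) / scale) * scale + lo

def truncated_quantized_step(model, inputs, targets, loss_fn, optimizer):
    full_precision = copy.deepcopy(model.state_dict())  # keep the full-precision weights
    for p in model.parameters():
        p.data.clamp_(-1, 1)                # the 'truncated' style clamps before quantizing
        p.data = toy_quantize(p.data)       # forward/backward sees the quantized weights
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)  # gradients are computed at the quantized point
    loss.backward()
    model.load_state_dict(full_precision)   # restore the full-precision weights
    for p in model.parameters():
        if p.grad is not None:
            p.grad.data[p.data.abs() > 1] = 0  # zero the gradient outside [-1, 1]
    optimizer.step()                        # update the full-precision weights
    return loss.item()
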
Example no. 3
        actual_bit_huffmman = qhf.get_huffman_encoding_mean_bit_length(model.parameters(), quant_fun,
                                                                       'uniform', s=2**curr_num_bit)
        print('Effective bit Huffman: {} - Size reduction: {}'.format(
            actual_bit_huffmman,
            mhf.get_size_reduction(actual_bit_huffmman, bucket_size=256)))
    if CHECK_PM_QUANTIZATION:
        QUANTIZE_FIRST_LAST_LAYER = False
        if 'distilled' in x:
            for numBit in numBits:
                for bucket_size in (None, 256):
                    model.load_state_dict(cifar10Manager.load_model_state_dict(x))
                    numParam = sum(1 for _ in model.parameters())
                    for idx, p in enumerate(model.parameters()):
                        if QUANTIZE_FIRST_LAST_LAYER is False:
                            if idx == 0 or idx == numParam - 1:
                                continue
                        p.data = quantization.uniformQuantization(p.data, s=2**numBit, type_of_scaling='linear',
                                                                  bucket_size=bucket_size)[0]
                    predAcc = cnn_hf.evaluateModel(model, test_loader, fastEvaluation=False)
                    print('PM quantization of model "{}" with {} bits and {} buckets: {:.2f}%'.format(
                        x, numBit, bucket_size, predAcc * 100))
                    quant_fun = functools.partial(quantization.uniformQuantization, s=2**numBit, bucket_size=bucket_size)
                    actual_bit_huffmman = qhf.get_huffman_encoding_mean_bit_length(model.parameters(), quant_fun,
                                                                                   'uniform',s=2**numBit)
                    size_mb = mhf.get_size_quantized_model(model, numBit, quant_fun, bucket_size,
                                                           quantizeFirstLastLayer=QUANTIZE_FIRST_LAST_LAYER)
                    print('Effective bit Huffman: {} - Size reduction: {} - Size MB: {}'.format(
                        actual_bit_huffmman,
                        mhf.get_size_reduction(actual_bit_huffmman, bucket_size=bucket_size),
                        size_mb))

distilled_model_names = ['cifar10_distilled_spec{}'.format(idx_spec) for idx_spec in range(len(smallerModelSpecs))]
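
The 'Effective bit Huffman' numbers printed above come from Huffman-coding the quantized weight indices and measuring the mean code length; the size reduction then compares that figure against 32-bit floats plus the per-bucket scaling overhead. The following is an illustrative, self-contained version of that computation: build_huffman_lengths and effective_bits are stand-ins for qhf.get_huffman_encoding_mean_bit_length, and charging two 32-bit scaling values per bucket in the final estimate is an assumption, not necessarily how mhf.get_size_reduction counts the overhead.

import heapq
import itertools
import torch

def build_huffman_lengths(counts):
    # Huffman code lengths (in bits) for a {symbol: frequency} dict
    tie = itertools.count()
    heap = [(c, next(tie), {sym: 0}) for sym, c in counts.items()]
    heapq.heapify(heap)
    if len(heap) == 1:
        return {sym: 1 for sym in counts}
    while len(heap) > 1:
        c1, _, depths1 = heapq.heappop(heap)
        c2, _, depths2 = heapq.heappop(heap)
        merged = {sym: d + 1 for sym, d in {**depths1, **depths2}.items()}
        heapq.heappush(heap, (c1 + c2, next(tie), merged))
    return heap[0][2]

def effective_bits(tensor, num_bits=4):
    # quantize to integer indices, then return the mean Huffman code length per weight
    levels = 2 ** num_bits
    lo, hi = tensor.min(), tensor.max()
    idx = torch.round((tensor - lo) / (hi - lo + 1e-12) * (levels - 1)).to(torch.long)
    counts = torch.bincount(idx.flatten(), minlength=levels)
    freq = {i: int(c) for i, c in enumerate(counts) if int(c) > 0}
    lengths = build_huffman_lengths(freq)
    total = sum(freq.values())
    return sum(freq[sym] * lengths[sym] for sym in freq) / total

weights = torch.randn(10000)
bits = effective_bits(weights, num_bits=4)
bucket_size = 256
# rough size reduction vs. 32-bit floats, assuming two 32-bit scaling values per bucket
reduction = 32.0 / (bits + 2 * 32.0 / bucket_size)
print('Effective bit Huffman: {:.2f} - approx. size reduction: {:.1f}x'.format(bits, reduction))
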
Example no. 4
def train_model(model,
                train_loader,
                test_loader,
                plot_path,
                optim=None,
                options=None,
                stochasticRounding=False,
                quantizeWeights=False,
                numBits=8,
                maxElementAllowedForQuantization=False,
                bucket_size=None,
                subtractMeanInQuantization=False,
                quantizationFunctionToUse='uniformLinearScaling',
                backprop_quantization_style='none',
                num_estimate_quant_grad=1,
                use_distillation_loss=False,
                teacher_model=None,
                quantize_first_and_last_layer=True):

    if options is None:
        options = copy.deepcopy(onmt.standard_options.stdOptions)
    if not isinstance(options, dict):
        options = mhf.convertToDictionary(options)
    options = handle_options(options)
    options = mhf.convertToNamedTuple(options)

    if optim is None:
        optim = create_optimizer(model, options)

    if use_distillation_loss is True and teacher_model is None:
        raise ValueError(
            'If training with distilled word level, we need teacher_model to be passed'
        )

    if teacher_model is not None:
        teacher_model.eval()

    step_since_last_grad_quant_estimation = 0
    num_param_model = sum(1 for _ in model.parameters())
    if quantizeWeights:
        quantizationFunctionToUse = quantizationFunctionToUse.lower()
        if quantizationFunctionToUse == 'uniformAbsMaxScaling'.lower():
            s = 2**(numBits - 1)
            type_of_scaling = 'absmax'
        elif quantizationFunctionToUse == 'uniformLinearScaling'.lower():
            s = 2**numBits
            type_of_scaling = 'linear'
        else:
            raise ValueError(
                'The specified quantization function is not present')

        if backprop_quantization_style is None or backprop_quantization_style in (
                'none', 'truncated'):
            quantizeFunctions = lambda x: quantization.uniformQuantization(
                x,
                s,
                type_of_scaling=type_of_scaling,
                stochastic_rounding=stochasticRounding,
                max_element=maxElementAllowedForQuantization,
                subtract_mean=subtractMeanInQuantization,
                modify_in_place=False,
                bucket_size=bucket_size)[0]

        elif backprop_quantization_style == 'complicated':
            quantizeFunctions = [
                quantization.uniformQuantization_variable(
                    s,
                    type_of_scaling=type_of_scaling,
                    stochastic_rounding=stochasticRounding,
                    max_element=maxElementAllowedForQuantization,
                    subtract_mean=subtractMeanInQuantization,
                    modify_in_place=False,
                    bucket_size=bucket_size) for _ in model.parameters()
            ]
        else:
            raise ValueError(
                'The specified backprop_quantization_style not recognized')

    fields = train_loader.dataset.fields
    # Collect features.
    src_features = collect_features(train_loader.dataset, fields)
    for j, feat in enumerate(src_features):
        print(' * src feature %d size = %d' % (j, len(fields[feat].vocab)))

    train_loss = make_loss_compute(model, fields["tgt"].vocab,
                                   train_loader.dataset, options.copy_attn,
                                   options.copy_attn_force,
                                   use_distillation_loss, teacher_model)
    # for validation we do not use the distillation loss; it would skew the perplexity computation
    # (a reference formulation of the distillation loss is sketched after this example)
    valid_loss = make_loss_compute(model, fields["tgt"].vocab,
                                   test_loader.dataset, options.copy_attn,
                                   options.copy_attn_force)

    trunc_size = None  #options.truncated_decoder  # Badly named...
    shard_size = options.max_generator_batches

    trn_writer = tbx.SummaryWriter(plot_path + '_output/train')
    tst_writer = tbx.SummaryWriter(plot_path + '_output/test')

    trainer = thf.MyTrainer(model, train_loader, test_loader, train_loss,
                            valid_loss, optim, trunc_size, shard_size)

    perplexity_epochs = []
    for epoch in range(options.start_epoch, options.epochs + 1):
        MAX_Memory = 0
        train_stats = onmt.Statistics()
        model.train()
        for idx_batch, batch in enumerate(train_loader):

            model.zero_grad()

            if quantizeWeights:
                if step_since_last_grad_quant_estimation >= num_estimate_quant_grad:
                    # save the full-precision weights: the quantized weights are used only
                    # for the forward/backward pass, while the optimizer keeps updating the
                    # full-precision copy
                    model_state_dict = model.state_dict()
                    for idx, p in enumerate(model.parameters()):
                        if quantize_first_and_last_layer is False:
                            if idx == 0 or idx == num_param_model - 1:
                                continue
                        if backprop_quantization_style == 'truncated':
                            p.data.clamp_(
                                -1, 1
                            )  # TODO: Is this necessary? Clamping the weights?
                        if backprop_quantization_style in ('none',
                                                           'truncated'):
                            p.data = quantizeFunctions(p.data)
                        elif backprop_quantization_style == 'complicated':
                            p.data = quantizeFunctions[idx].forward(p.data)
                        else:
                            raise ValueError
            trainer.forward_and_backward(idx_batch, batch, epoch, train_stats,
                                         report_func, use_distillation_loss,
                                         teacher_model)

            if quantizeWeights:
                if step_since_last_grad_quant_estimation >= num_estimate_quant_grad:
                    model.load_state_dict(model_state_dict)
                    del model_state_dict  # free memory

                    if backprop_quantization_style in ('truncated',
                                                       'complicated'):
                        for idx, p in enumerate(model.parameters()):
                            if quantize_first_and_last_layer is False:
                                if idx == 0 or idx == num_param_model - 1:
                                    continue
                            # Backward correction for the quantization. For the 'none' style
                            # nothing is done. For the 'truncated' style the gradient is zeroed
                            # wherever the weight's absolute value exceeds 1, as per the paper
                            # here: https://arxiv.org/pdf/1609.07061.pdf
                            # 'complicated' uses the author's own rule via quantizeFunctions[idx].backward.
                            if backprop_quantization_style == 'truncated':
                                p.grad.data[p.data.abs() > 1] = 0
                            elif backprop_quantization_style == 'complicated':
                                p.grad.data = quantizeFunctions[idx].backward(
                                    p.grad.data)

            #update parameters after every batch
            trainer.optim.step()

            if step_since_last_grad_quant_estimation >= num_estimate_quant_grad:
                step_since_last_grad_quant_estimation = 0

            step_since_last_grad_quant_estimation += 1

        print('Train perplexity: %g' % train_stats.ppl())
        print('Train accuracy: %g' % train_stats.accuracy())

        trn_writer.add_scalar('ppl', train_stats.ppl(), epoch + 1)
        trn_writer.add_scalar('acc', train_stats.accuracy(), epoch + 1)

        # 2. Validate on the validation set.
        MAX_Memory = max(MAX_Memory, torch.cuda.max_memory_allocated())
        valid_stats = trainer.validate()
        print('Validation perplexity: %g' % valid_stats.ppl())
        print('Validation accuracy: %g' % valid_stats.accuracy())
        print('Max allocated memory: {:.2f}MB'.format(MAX_Memory / (1024**2)))
        perplexity_epochs.append(valid_stats.ppl())

        tst_writer.add_scalar('ppl', valid_stats.ppl(), epoch + 1)
        tst_writer.add_scalar('acc', valid_stats.accuracy(), epoch + 1)

        # 3. Update the learning rate
        trainer.epoch_step(valid_stats.ppl(), epoch)

    if quantizeWeights:
        for idx, p in enumerate(model.parameters()):
            if backprop_quantization_style == 'truncated':
                p.data.clamp_(
                    -1, 1)  # TODO: Is this necessary? Clamping the weights?
            if backprop_quantization_style in ('none', 'truncated'):
                p.data = quantizeFunctions(p.data)
            elif backprop_quantization_style == 'complicated':
                p.data = quantizeFunctions[idx].forward(p.data)
                del quantizeFunctions[idx].saved_for_backward
                quantizeFunctions[idx].saved_for_backward = None  # free memory
            else:
                raise ValueError

    informationDict = {}
    informationDict['perplexity'] = perplexity_epochs
    informationDict[
        'numEpochsTrained'] = options.epochs + 1 - options.start_epoch
    return model, informationDict
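
For reference, the distillation loss enabled by use_distillation_loss and teacher_model in both train_model functions follows the standard knowledge-distillation recipe: a weighted sum of the usual cross-entropy on the hard labels and a KL term towards the temperature-softened teacher distribution. The repository's actual loss lives inside cnn_hf.forward_and_backward and make_loss_compute and is not reproduced here; the helper below, with its hypothetical names and default temperature/alpha values, is only a sketch of that formulation.

import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, targets, temperature=4.0, alpha=0.7):
    # soft part: KL divergence to the temperature-softened teacher, scaled by T**2
    soft = F.kl_div(
        F.log_softmax(student_logits / temperature, dim=-1),
        F.softmax(teacher_logits / temperature, dim=-1),
        reduction='batchmean') * (temperature ** 2)
    # hard part: ordinary cross-entropy with the ground-truth labels
    hard = F.cross_entropy(student_logits, targets)
    return alpha * soft + (1.0 - alpha) * hard

# usage sketch (placeholder names); the teacher stays in eval mode, as in the code above
# with torch.no_grad():
#     teacher_logits = teacher_model(inputs)
# loss = distillation_loss(student_model(inputs), teacher_logits, targets)
# loss.backward()
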
Example no. 5
        currOptions = distilledOptions
    elif 'teacher' in x:
        currOptions = teacherOptions
    elif 'word_level' in x:
        currOptions = distilledOptions
    else:
        currOptions = smallerOptions

    currOptions['batch_size'] = 1  #important for the BLEU computation.
    model = tmm.create_model(dataset.fields, options=currOptions)
    if USE_CUDA: model = model.cuda()
    model.load_state_dict(onmtManager.load_model_state_dict(x))
    if to_quantize:
        for p in model.parameters():
            p.data = quantization.uniformQuantization(p.data,
                                                      2**4,
                                                      bucket_size=256)[0]

    num_examples = 5
    print('Example of translation for "{}"'.format(x))
    linesToTranslate, translated_lines, referenceLines = transl_hf.get_translation_examples(
        model, dataset, num_examples, currOptions, standardTranslateOptions)
    print('Original Sentences == Translation == Ref Translation')
    print('\n'.join(
        ' == '.join(x)
        for x in zip(linesToTranslate, translated_lines, referenceLines)))
    bleu = transl_hf.get_bleu_model(model, dataset, currOptions,
                                    standardTranslateOptions)
    print('Model "{}"  ==> Perplexity: {}, BLEU: {}'.format(
        x, perplexity, bleu))
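
A note on the quantization.uniformQuantization(p.data, 2**4, bucket_size=256) call above: bucketed uniform quantization splits the flattened tensor into buckets of 256 values, rescales each bucket with its own range, rounds onto 2**4 = 16 levels, and maps back. The function below is only an illustrative guess at that behaviour; padding of the last bucket, the choice of per-bucket statistics, and the scaling variant in the actual library may differ.

import torch

def bucketed_uniform_quantization(t, s=2 ** 4, bucket_size=256):
    flat = t.flatten()
    pad = (-flat.numel()) % bucket_size
    if pad:
        flat = torch.cat([flat, flat.new_zeros(pad)])  # pad the last bucket
    buckets = flat.view(-1, bucket_size)
    lo = buckets.min(dim=1, keepdim=True).values
    hi = buckets.max(dim=1, keepdim=True).values
    span = (hi - lo).clamp_min(1e-12)
    # rescale each bucket to [0, 1], round onto s levels, then map back to the original range
    q = torch.round((buckets - lo) / span * (s - 1)) / (s - 1) * span + lo
    return q.flatten()[:t.numel()].view_as(t)

# e.g. quantizing every parameter of a loaded model before evaluation, as done above:
# for p in model.parameters():
#     p.data = bucketed_uniform_quantization(p.data, s=2 ** 4, bucket_size=256)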