Example #1
    def __init__(self, model, fields, model_options, translate_options):
        # Add in default model arguments, possibly added since training.
        # Note: the fields passed in here must still contain src_map; this is not what the
        # translation_dataset loader produces, since it strips those fields. Be careful.
        self.opt = mhf.convertToNamedTuple(translate_options)
        self.fields = fields
        model_opt = model_options
        for arg in translate_options:
            if arg not in model_opt:
                model_opt[arg] = translate_options[arg]

        model_opt = mhf.convertToNamedTuple(model_opt)

        self._type = model_opt.model_type
        self.copy_attn = model_opt.copy_attn

        self.model = model
        self.model.eval()
        self.model.generator.eval()

        # for debugging
        self.beam_accum = None
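
A minimal usage sketch for this constructor, assuming the class is the Translator_modified referenced in Example #7. The file path, the option keys, and the option values below are illustrative placeholders, not values taken from the original code.

import copy
import torch
import onmt
import onmt.standard_options

# both option arguments are plain dicts; the model options are assumed to contain
# keys such as 'model_type' and 'copy_attn'
fields = onmt.IO.ONMTDataset.load_fields(torch.load('processed.vocab.pt'))  # placeholder path
model = create_model(fields)                                     # helper from Example #2
model_options = copy.deepcopy(onmt.standard_options.stdOptions)  # assumed to be a plain dict
translate_options = {'beam_size': 5, 'n_best': 1, 'batch_size': 30,
                     'tgt': None, 'verbose': False}              # assumed keys
translator = Translator_modified(model, fields, model_options, translate_options)
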
Example #2
def create_model(fields, options=None):
    if options is None:
        options = copy.deepcopy(onmt.standard_options.stdOptions)
    if not isinstance(options, dict):
        options = mhf.convertToDictionary(options)
    options = handle_options(options)
    options = mhf.convertToNamedTuple(options)
    model = onmt.ModelConstructor.make_base_model(options,
                                                  fields,
                                                  USE_CUDA,
                                                  checkpoint=None)
    if len(options.gpuid) > 1:
        model = nn.DataParallel(model, device_ids=options.gpuid, dim=1)

    return model
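
A usage sketch for create_model, assuming the vocab file saved by the preprocessing step in Example #4; the path is a placeholder.

import torch
import onmt

fields = onmt.IO.ONMTDataset.load_fields(torch.load('processed.vocab.pt'))  # placeholder path
model = create_model(fields)   # options=None falls back to onmt.standard_options.stdOptions
print(sum(p.data.numel() for p in model.parameters()), 'parameters')
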
Example #3
def create_optimizer(model_or_iterable, options=None):
    if options is None:
        options = copy.deepcopy(onmt.standard_options.stdOptions)
    if not isinstance(options, dict):
        options = mhf.convertToDictionary(options)
    options = handle_options(options)
    options = mhf.convertToNamedTuple(options)
    optim = onmt.Optim(options.optim,
                       options.learning_rate,
                       options.max_grad_norm,
                       lr_decay=options.learning_rate_decay,
                       start_decay_at=options.start_decay_at,
                       opt=options)

    try:
        optim.set_parameters(model_or_iterable.parameters())
    except AttributeError:
        optim.set_parameters(model_or_iterable)
    return optim
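
A usage sketch for create_optimizer: thanks to the AttributeError fallback above, the same helper accepts either a model (it then uses model.parameters()) or a plain iterable of variables, as Example #5 does with its quantization points. Here `fields` is assumed to be built as in Example #2; the tensors are illustrative.

import torch
from torch.autograd import Variable

model = create_model(fields)                       # fields built as in Example #2
optim_for_model = create_optimizer(model)          # uses model.parameters()
points = [Variable(torch.randn(16), requires_grad=True) for _ in range(3)]
optim_for_points = create_optimizer(points)        # a plain iterable also works
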
Example #4
    def process_dataset(self):
        stdProcessOptions = onmt.standard_options.standardPreProcessingOptions
        stdProcessOptions = mhf.convertToNamedTuple(stdProcessOptions)

        print('Preparing Training...')

        with codecs.open(self.trainFilesPath[0], "r", "utf-8") as src_file:
            src_line = src_file.readline().strip().split()
            _, _, nFeatures = onmt.IO.extract_features(src_line)

        fields = onmt.IO.ONMTDataset.get_fields(nFeatures)
        print("Building Training...")
        train = onmt.IO.ONMTDataset(self.trainFilesPath[0],
                                    self.trainFilesPath[1], fields,
                                    stdProcessOptions)
        print("Building Vocab...")
        onmt.IO.ONMTDataset.build_vocab(train, stdProcessOptions)

        print("Building Test...")
        test = onmt.IO.ONMTDataset(self.testFilesPath[0],
                                   self.testFilesPath[1], fields,
                                   stdProcessOptions)
        print("Saving train/test/fields")

        # Can't save fields, so remove/reconstruct at training time.
        with open(self.processedFilesPath[0], 'wb') as processed_vocab, \
             open(self.processedFilesPath[1], 'wb') as processed_train, \
             open(self.processedFilesPath[2], 'wb') as processed_test:

            torch.save(onmt.IO.ONMTDataset.save_vocab(fields), processed_vocab)
            train.fields = []
            test.fields = []
            torch.save(train, processed_train)
            torch.save(test, processed_test)

        print('Saving done.')
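
A sketch of the matching load step for the artifacts saved above; the paths stand in for self.processedFilesPath, and re-attaching fields mirrors the comment that fields cannot be saved directly.

import torch
import onmt

fields = onmt.IO.ONMTDataset.load_fields(torch.load('processed.vocab.pt'))  # placeholder paths
train = torch.load('processed.train.pt')
test = torch.load('processed.test.pt')
train.fields = fields   # fields were stripped before saving, so re-attach them here
test.fields = fields
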
Example #5
def optimize_quantization_points(modelToQuantize,
                                 train_loader,
                                 test_loader,
                                 options,
                                 optim=None,
                                 numPointsPerTensor=16,
                                 assignBitsAutomatically=False,
                                 use_distillation_loss=False,
                                 bucket_size=None):

    print('Preparing training - pre processing tensors')

    if options is None:
        options = copy.deepcopy(onmt.standard_options.stdOptions)
    if not isinstance(options, dict):
        options = mhf.convertToDictionary(options)
    options = handle_options(options)
    options = mhf.convertToNamedTuple(options)

    modelToQuantize.eval()
    quantizedModel = copy.deepcopy(modelToQuantize)

    fields = train_loader.dataset.fields
    train_loss = make_loss_compute(quantizedModel, fields["tgt"].vocab,
                                   train_loader.dataset, options.copy_attn,
                                   options.copy_attn_force)
    valid_loss = make_loss_compute(quantizedModel, fields["tgt"].vocab,
                                   test_loader.dataset, options.copy_attn,
                                   options.copy_attn_force)
    trunc_size = options.truncated_decoder  # Badly named...
    shard_size = options.max_generator_batches

    numTensorsNetwork = sum(1 for _ in quantizedModel.parameters())
    if isinstance(numPointsPerTensor, int):
        numPointsPerTensor = [numPointsPerTensor] * numTensorsNetwork
    if len(numPointsPerTensor) != numTensorsNetwork:
        raise ValueError(
            'numPointsPerTensor must have one entry per tensor in the network'
        )

    scalingFunction = quantization.ScalingFunction(type_scaling='linear',
                                                   max_element=False,
                                                   subtract_mean=False,
                                                   modify_in_place=False,
                                                   bucket_size=bucket_size)

    quantizedModel.zero_grad()
    dummy_optim = create_optimizer(
        quantizedModel, options)  #dummy optim, just to pass to trainer
    if assignBitsAutomatically:
        trainer = thf.MyTrainer(quantizedModel, train_loader, test_loader,
                                train_loss, valid_loss, dummy_optim,
                                trunc_size, shard_size)
        batch = next(iter(train_loader))
        quantizedModel.zero_grad()
        trainer.forward_and_backward(0, batch, 0, onmt.Statistics(), None)
        fisherInformation = []
        for p in quantizedModel.parameters():
            fisherInformation.append(p.grad.data.norm())
        numPointsPerTensor = qhf.assign_bits_automatically(fisherInformation,
                                                           numPointsPerTensor,
                                                           input_is_point=True)
        quantizedModel.zero_grad()
        del trainer
        del optim

    # initialize the points using the percentile function so as to make them all usable
    pointsPerTensor = []
    for idx, p in enumerate(quantizedModel.parameters()):
        initial_points = qhf.initialize_quantization_points(
            p.data, scalingFunction, numPointsPerTensor[idx])
        initial_points = Variable(initial_points, requires_grad=True)
        # do a dummy backprop so that the grad attribute is initialized. We need this because we call
        # the .backward() function manually later on (since pytorch can't assign variables to model
        # parameters)
        initial_points.sum().backward()
        pointsPerTensor.append(initial_points)

    optionsOpt = copy.deepcopy(mhf.convertToDictionary(options))
    optimizer = create_optimizer(pointsPerTensor,
                                 mhf.convertToNamedTuple(optionsOpt))
    trainer = thf.MyTrainer(quantizedModel, train_loader, test_loader,
                            train_loss, valid_loss, dummy_optim, trunc_size,
                            shard_size)
    perplexity_epochs = []

    quantizationFunctions = []
    for idx, p in enumerate(modelToQuantize.parameters()):
        #efficient version of nonUniformQuantization
        quant_fun = quantization.nonUniformQuantization_variable(
            max_element=False,
            subtract_mean=False,
            modify_in_place=False,
            bucket_size=bucket_size,
            pre_process_tensors=True,
            tensor=p.data)

        quantizationFunctions.append(quant_fun)

    print('Pre processing done, training started')

    for epoch in range(options.start_epoch, options.epochs + 1):
        train_stats = onmt.Statistics()
        quantizedModel.train()
        for idx_batch, batch in enumerate(train_loader):

            #zero the gradient
            quantizedModel.zero_grad()

            # quantize the weights
            for idx, p_quantized in enumerate(quantizedModel.parameters()):
                # Use the efficient version of nonUniformQuantization: the tensors (which don't change
                # across iterations) are cached inside the quantization function, so we only need to
                # pass the quantization points.
                p_quantized.data = quantizationFunctions[idx].forward(
                    None, pointsPerTensor[idx].data)

            trainer.forward_and_backward(idx_batch, batch, epoch, train_stats,
                                         report_func, use_distillation_loss,
                                         modelToQuantize)

            # now get the gradient of the pointsPerTensor
            for idx, p in enumerate(quantizedModel.parameters()):
                pointsPerTensor[idx].grad.data = quantizationFunctions[
                    idx].backward(p.grad.data)[1]

            optimizer.step()

            # after optimizer.step() we need to make sure that the points are still sorted
            for points in pointsPerTensor:
                points.data = torch.sort(points.data)[0]

        print('Train perplexity: %g' % train_stats.ppl())
        print('Train accuracy: %g' % train_stats.accuracy())

        # 2. Validate on the validation set.
        valid_stats = trainer.validate()
        print('Validation perplexity: %g' % valid_stats.ppl())
        print('Validation accuracy: %g' % valid_stats.accuracy())
        perplexity_epochs.append(valid_stats.ppl())

        # 3. Update the learning rate
        optimizer.updateLearningRate(valid_stats.ppl(), epoch)

    informationDict = {}
    informationDict['perplexity'] = perplexity_epochs
    informationDict[
        'numEpochsTrained'] = options.epochs + 1 - options.start_epoch
    return pointsPerTensor, informationDict
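
A sketch of how the returned points could be baked back into a copy of the model after optimization, reusing the names and the nonUniformQuantization_variable calls from the function body above; this step is not part of the original function.

finalModel = copy.deepcopy(modelToQuantize)
for idx, p in enumerate(finalModel.parameters()):
    quant_fun = quantization.nonUniformQuantization_variable(
        max_element=False, subtract_mean=False, modify_in_place=False,
        bucket_size=bucket_size, pre_process_tensors=True, tensor=p.data)
    p.data = quant_fun.forward(None, pointsPerTensor[idx].data)
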
Example #6
def train_model(model,
                train_loader,
                test_loader,
                plot_path,
                optim=None,
                options=None,
                stochasticRounding=False,
                quantizeWeights=False,
                numBits=8,
                maxElementAllowedForQuantization=False,
                bucket_size=None,
                subtractMeanInQuantization=False,
                quantizationFunctionToUse='uniformLinearScaling',
                backprop_quantization_style='none',
                num_estimate_quant_grad=1,
                use_distillation_loss=False,
                teacher_model=None,
                quantize_first_and_last_layer=True):

    if options is None:
        options = copy.deepcopy(onmt.standard_options.stdOptions)
    if not isinstance(options, dict):
        options = mhf.convertToDictionary(options)
    options = handle_options(options)
    options = mhf.convertToNamedTuple(options)

    if optim is None:
        optim = create_optimizer(model, options)

    if use_distillation_loss is True and teacher_model is None:
        raise ValueError(
            'Training with word-level distillation requires teacher_model to be passed'
        )

    if teacher_model is not None:
        teacher_model.eval()

    step_since_last_grad_quant_estimation = 0
    num_param_model = sum(1 for _ in model.parameters())
    if quantizeWeights:
        quantizationFunctionToUse = quantizationFunctionToUse.lower()
        if quantizationFunctionToUse == 'uniformAbsMaxScaling'.lower():
            s = 2**(numBits - 1)
            type_of_scaling = 'absmax'
        elif quantizationFunctionToUse == 'uniformLinearScaling'.lower():
            s = 2**numBits
            type_of_scaling = 'linear'
        else:
            raise ValueError(
                'The specified quantization function is not present')

        if backprop_quantization_style is None or backprop_quantization_style in (
                'none', 'truncated'):
            quantizeFunctions = lambda x: quantization.uniformQuantization(
                x,
                s,
                type_of_scaling=type_of_scaling,
                stochastic_rounding=stochasticRounding,
                max_element=maxElementAllowedForQuantization,
                subtract_mean=subtractMeanInQuantization,
                modify_in_place=False,
                bucket_size=bucket_size)[0]

        elif backprop_quantization_style == 'complicated':
            quantizeFunctions = [quantization.uniformQuantization_variable(s, type_of_scaling=type_of_scaling,
                                                    stochastic_rounding=stochasticRounding,
                                                    max_element=maxElementAllowedForQuantization,
                                                    subtract_mean=subtractMeanInQuantization,
                                                    modify_in_place=False, bucket_size=bucket_size) \
                                 for _ in model.parameters()]
        else:
            raise ValueError(
                'The specified backprop_quantization_style not recognized')

    fields = train_loader.dataset.fields
    # Collect features.
    src_features = collect_features(train_loader.dataset, fields)
    for j, feat in enumerate(src_features):
        print(' * src feature %d size = %d' % (j, len(fields[feat].vocab)))

    train_loss = make_loss_compute(model, fields["tgt"].vocab,
                                   train_loader.dataset, options.copy_attn,
                                   options.copy_attn_force,
                                   use_distillation_loss, teacher_model)
    # for validation we don't use the distillation loss; it would distort the perplexity computation
    valid_loss = make_loss_compute(model, fields["tgt"].vocab,
                                   test_loader.dataset, options.copy_attn,
                                   options.copy_attn_force)

    trunc_size = None  #options.truncated_decoder  # Badly named...
    shard_size = options.max_generator_batches

    trn_writer = tbx.SummaryWriter(plot_path + '_output/train')
    tst_writer = tbx.SummaryWriter(plot_path + '_output/test')

    trainer = thf.MyTrainer(model, train_loader, test_loader, train_loss,
                            valid_loss, optim, trunc_size, shard_size)

    perplexity_epochs = []
    for epoch in range(options.start_epoch, options.epochs + 1):
        MAX_Memory = 0
        train_stats = onmt.Statistics()
        model.train()
        for idx_batch, batch in enumerate(train_loader):

            model.zero_grad()

            if quantizeWeights:
                if step_since_last_grad_quant_estimation >= num_estimate_quant_grad:
                    # save the full-precision weights: we quantize only to compute the gradients,
                    # and keep updating the non-quantized weights throughout the algorithm
                    model_state_dict = model.state_dict()
                    for idx, p in enumerate(model.parameters()):
                        if quantize_first_and_last_layer is False:
                            if idx == 0 or idx == num_param_model - 1:
                                continue
                        if backprop_quantization_style == 'truncated':
                            p.data.clamp_(
                                -1, 1
                            )  # TODO: Is this necessary? Clamping the weights?
                        if backprop_quantization_style in ('none',
                                                           'truncated'):
                            p.data = quantizeFunctions(p.data)
                        elif backprop_quantization_style == 'complicated':
                            p.data = quantizeFunctions[idx].forward(p.data)
                        else:
                            raise ValueError
            trainer.forward_and_backward(idx_batch, batch, epoch, train_stats,
                                         report_func, use_distillation_loss,
                                         teacher_model)

            if quantizeWeights:
                if step_since_last_grad_quant_estimation >= num_estimate_quant_grad:
                    model.load_state_dict(model_state_dict)
                    del model_state_dict  # free memory

                    if backprop_quantization_style in ('truncated',
                                                       'complicated'):
                        for idx, p in enumerate(model.parameters()):
                            if quantize_first_and_last_layer is False:
                                if idx == 0 or idx == num_param_model - 1:
                                    continue
                            # Backward pass for the quantization. For the 'none' style we do nothing.
                            # For the 'truncated' style we just truncate the weight gradients,
                            # as per the paper: https://arxiv.org/pdf/1609.07061.pdf
                            # 'complicated' is my own derivation, but unsure whether to use it or not.
                            if backprop_quantization_style == 'truncated':
                                p.grad.data[p.data.abs() > 1] = 0
                            elif backprop_quantization_style == 'complicated':
                                p.grad.data = quantizeFunctions[idx].backward(
                                    p.grad.data)

            #update parameters after every batch
            trainer.optim.step()

            if step_since_last_grad_quant_estimation >= num_estimate_quant_grad:
                step_since_last_grad_quant_estimation = 0

            step_since_last_grad_quant_estimation += 1

        print('Train perplexity: %g' % train_stats.ppl())
        print('Train accuracy: %g' % train_stats.accuracy())

        trn_writer.add_scalar('ppl', train_stats.ppl(), epoch + 1)
        trn_writer.add_scalar('acc', train_stats.accuracy(), epoch + 1)

        # 2. Validate on the validation set.
        MAX_Memory = max(MAX_Memory, torch.cuda.max_memory_allocated())
        valid_stats = trainer.validate()
        print('Validation perplexity: %g' % valid_stats.ppl())
        print('Validation accuracy: %g' % valid_stats.accuracy())
        print('Max allocated memory: {:2f}MB'.format(MAX_Memory / (1024**2)))
        perplexity_epochs.append(valid_stats.ppl())

        tst_writer.add_scalar('ppl', valid_stats.ppl(), epoch + 1)
        tst_writer.add_scalar('acc', valid_stats.accuracy(), epoch + 1)

        # 3. Update the learning rate
        trainer.epoch_step(valid_stats.ppl(), epoch)

    if quantizeWeights:
        for idx, p in enumerate(model.parameters()):
            if backprop_quantization_style == 'truncated':
                p.data.clamp_(
                    -1, 1)  # TODO: Is this necessary? Clamping the weights?
            if backprop_quantization_style in ('none', 'truncated'):
                p.data = quantizeFunctions(p.data)
            elif backprop_quantization_style == 'complicated':
                p.data = quantizeFunctions[idx].forward(p.data)
                del quantizeFunctions[idx].saved_for_backward
                quantizeFunctions[idx].saved_for_backward = None  # free memory
            else:
                raise ValueError

    informationDict = {}
    informationDict['perplexity'] = perplexity_epochs
    informationDict[
        'numEpochsTrained'] = options.epochs + 1 - options.start_epoch
    return model, informationDict
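
A usage sketch for train_model: a quantization-aware run with word-level distillation. The argument values, the 'plots/quant4' path, and the assumption that the teacher already holds trained weights are illustrative; `fields`, `train_loader`, and `test_loader` refer to objects built in the earlier examples.

student = create_model(fields)          # helper from Example #2
teacher = create_model(fields)          # assumed to be loaded with trained full-precision weights
student, info = train_model(student, train_loader, test_loader, 'plots/quant4',
                            quantizeWeights=True, numBits=4, bucket_size=256,
                            backprop_quantization_style='truncated',
                            use_distillation_loss=True, teacher_model=teacher)
print(info['perplexity'])
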
Example #7
def translate_sequences(model, vocab_file, model_options, translate_options, source_to_translate, out_file_path=None,
                        verbose=True, percentages_to_show=25, replace_unk_hack=False, test_memory=False):

    """Translate phrases using the model passed as a parameter."""

    fields = onmt.IO.ONMTDataset.load_fields(torch.load(vocab_file))
    translator = Translator_modified(model, fields, model_options, translate_options)
    for key, val in onmt.standard_options.standardPreProcessingOptions.items():
        if key not in translate_options:
            translate_options[key] = val

    translate_options = mhf.convertToNamedTuple(translate_options)
    model_options = mhf.convertToNamedTuple(model_options)

    iterator_created = False
    dataset_created = False
    is_source_file = False
    # source_to_translate can be several different things:
    # - An ONMTDataset (must have been created with options=None though!)
    # - An OrderedIterator (must also have been created with options=None)
    # - A path to a file
    # - A string of phrases separated by '\n'
    # - A list of strings

    if isinstance(source_to_translate, onmt.IO.ONMTDataset):
        dataset = source_to_translate
        dataset_created = True
    elif isinstance(source_to_translate, onmt.IO.OrderedIterator):
        data_generator = source_to_translate
        dataset = source_to_translate.dataset
        dataset_created = True
        iterator_created = True
    elif isinstance(source_to_translate, str):
        if os.path.exists(source_to_translate):
            dataset = onmt.IO.ONMTDataset(source_to_translate, source_to_translate, translator.fields, None)
            dataset_created = True
            is_source_file, source_file_path = True, source_to_translate
        else:
            source_to_translate = re.sub('\n+', '\n', source_to_translate)
            source_to_translate = [x for x in source_to_translate.split('\n') if x]

    if isinstance(source_to_translate, list):
        temp_file_path = os.path.abspath('temp_file_translate_pytorch_{}'.format(uuid.uuid4()))
        with open(temp_file_path, 'w') as temp_file:
            for line in source_to_translate:
                temp_file.write(line + '\n')
        is_source_file, source_file_path = True, temp_file_path
        dataset = onmt.IO.ONMTDataset(temp_file_path, temp_file_path, translator.fields, None)
        dataset_created = True

    if not dataset_created:
        raise ValueError('source_to_translate could not be interpreted')

    if not iterator_created:
        data_generator = onmt.IO.OrderedIterator(dataset=dataset, #device=model_options.gpu,
                                                 batch_size=translate_options.batch_size,
                                                 train=False, sort=False, shuffle=False)

    if out_file_path is None:
        res = ''
    else:
        out_file = open(out_file_path, 'w')

    next_percentage_to_show = percentages_to_show
    total_num_batches = len(data_generator)
    pred_score_total, pred_words_total = 0, 0
    gold_score_total, gold_words_total = 0, 0
    counter = count(1)

    if replace_unk_hack and is_source_file:
        if data_generator.batch_size != 1:
            raise ValueError('For now the replace_unk_hack only works with batch size 1')
        source_file = open(source_file_path, 'r')
        lines_iterator = (line for line in source_file)

    try:
        for idx, batch in enumerate(data_generator):
            pred_batch, gold_batch, pred_scores, gold_scores, attn, src = translator.translate(batch, dataset)
            if test_memory and idx == 10:
                return torch.cuda.max_memory_allocated()
            pred_score_total += sum(score[0] for score in pred_scores)
            pred_words_total += sum(len(x[0]) for x in pred_batch)
            if translate_options.tgt:
                gold_score_total += sum(gold_scores)
                gold_words_total += sum(len(x) for x in batch.tgt[1:])

            # z_batch: an iterator over the predictions, their scores,
            # the gold sentence, its score, and the source sentence for each
            # sentence in the batch. It has to be zip_longest instead of
            # plain-old zip because the gold_batch has length 0 if the target
            # is not included.
            z_batch = zip_longest(
                pred_batch, gold_batch,
                pred_scores, gold_scores,
                (sent.squeeze(1) for sent in src.split(1, dim=1)))

            for pred_sents, gold_sent, pred_score, gold_score, src_sent in z_batch:
                n_best_preds = list(pred_sents[:translate_options.n_best])
                if replace_unk_hack and is_source_file:
                    original_line = next(lines_iterator).split(' ')
                    for pred in n_best_preds:
                        for idx_tok, tok in enumerate(pred):
                            if tok == '<unk>':
                                _, maxIndex = attn[0][0][idx_tok].max(0)
                                pred[idx_tok] = original_line[maxIndex[0]]
                n_best_preds = [" ".join(pred) for pred in n_best_preds]
                strToWrite = '\n'.join(n_best_preds) + '\n'

                if out_file_path is None:
                    res += strToWrite
                else:
                    out_file.write(strToWrite)
                    out_file.flush()

                if translate_options.verbose:
                    sent_number = next(counter)
                    words = get_src_words(
                        src_sent, translator.fields["src"].vocab.itos)

                    os.write(1, bytes('\nSENT %d: %s\n' %
                                      (sent_number, words), 'UTF-8'))

                    best_pred = n_best_preds[0]
                    best_score = pred_score[0]
                    os.write(1, bytes('PRED %d: %s\n' %
                                      (sent_number, best_pred), 'UTF-8'))
                    print("PRED SCORE: %.4f" % best_score)

                    if translate_options.tgt:
                        tgt_sent = ' '.join(gold_sent)
                        os.write(1, bytes('GOLD %d: %s\n' %
                                          (sent_number, tgt_sent), 'UTF-8'))
                        print("GOLD SCORE: %.4f" % gold_score)

                    if len(n_best_preds) > 1:
                        print('\nBEST HYP:')
                        for score, sent in zip(pred_score, n_best_preds):
                            os.write(1, bytes("[%.4f] %s\n" % (score, sent), 'UTF-8'))

            if idx / total_num_batches * 100 >= next_percentage_to_show:
                if verbose:
                    print('Total completed: {:.2f}%'.format(idx / total_num_batches * 100))
                next_percentage_to_show += percentages_to_show

        #report_score('PRED', pred_score_total, pred_words_total)

        if translate_options.tgt:
            report_score('GOLD', gold_score_total, gold_words_total)

    except Exception as e:
        print('An error occurred in translating sentences: {}'.format(e))
    try:
        source_file.close()
    except Exception:
        pass
    try:
        os.remove(temp_file_path)
    except Exception:
        pass

    if out_file_path is None:
        return res
    else:
        out_file.close()
        return out_file_path
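
A usage sketch for translate_sequences with an in-memory list of sentences; the vocab path and the two option dictionaries are placeholders for the values used at training time, and `model` refers to a model built as in the earlier examples.

sentences = ['a first source sentence .', 'a second source sentence .']
translations = translate_sequences(model,
                                   'processed.vocab.pt',     # vocab file saved in Example #4
                                   model_options,
                                   translate_options,
                                   sentences,
                                   out_file_path=None,
                                   verbose=False)
print(translations)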