Example #1
# assumed reconstruction of a truncated call that builds the BERT encoder
bert_model = nemo_nlp.huggingface.BERT(vocab_size=vocab_size,
                                       d_model=args.d_model,
                                       max_seq_length=args.max_sequence_length,
                                       hidden_act="gelu",
                                       factory=neural_factory)

# instantiate necessary modules for the whole pretraining pipeline, namely
# data layers, BERT encoder, and MLM and NSP loss functions
mlm_log_softmax = nemo_nlp.TransformerLogSoftmaxNM(vocab_size=vocab_size,
                                                   d_model=args.d_model,
                                                   factory=neural_factory)
mlm_loss = nemo_nlp.MaskedLanguageModelingLossNM(factory=neural_factory)

nsp_log_softmax = nemo_nlp.SentenceClassificationLogSoftmaxNM(
    d_model=args.d_model, num_classes=2, factory=neural_factory)
nsp_loss = nemo_nlp.NextSentencePredictionLossNM(factory=neural_factory)

bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2, factory=neural_factory)
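# LossAggregatorNM combines (sums) the MLM and NSP losses above into the
# single scalar objective used for pretraining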

# tie weights of MLM softmax layer and embedding layer of the encoder
mlm_log_softmax.log_softmax.dense.weight = \
    bert_model.bert.embeddings.word_embeddings.weight
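# after this assignment the softmax projection and the input embedding share
# one parameter tensor, so both are updated together and the model stores a
# single copy of the (vocab_size x d_model) matrix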

train_data_layer = nemo_nlp.BertPretrainingDataLayer(
    tokenizer=tokenizer,
    dataset=args.dataset_dir,
    name="train",
    sentence_indices_filename=args.train_sentence_indices_filename,
    max_seq_length=args.max_sequence_length,
    mask_probability=args.mask_probability,
    batch_size=args.batch_size,
    factory=neural_factory)
Example #2
""" create necessary modules for the whole translation pipeline, namely
data layers, BERT encoder, and MLM and NSP loss functions
"""
mlm_classifier = nemo_nlp.TokenClassifier(args.d_model,
                                          num_classes=tokenizer.vocab_size,
                                          num_layers=1,
                                          log_softmax=True)
mlm_loss_fn = nemo_nlp.MaskedLanguageModelingLossNM()

nsp_classifier = nemo_nlp.SequenceClassifier(args.d_model,
                                             num_classes=2,
                                             num_layers=2,
                                             log_softmax=True)
nsp_loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss()
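# NSP is a binary decision (is sentence B the true next sentence?), so a
# plain cross-entropy over the two classes serves as its loss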

bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)

# tie weights of MLM softmax layer and embedding layer of the encoder
mlm_classifier.mlp.last_linear_layer.weight = \
    bert_model.bert.embeddings.word_embeddings.weight


def create_pipeline(data_file, max_seq_length, mask_probability,
                    short_seq_prob, batch_size):
    data_layer = nemo_nlp.BertPretrainingDataLayer(tokenizer,
                                                   data_file,
                                                   max_seq_length,
                                                   mask_probability,
                                                   short_seq_prob,
                                                   batch_size=batch_size)
    steps_per_epoch = len(data_layer) // (batch_size * args.num_gpus)
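    # the source snippet ends here; an assumed sketch of the rest of the
    # pipeline follows (the unpacked tensor names and the loss input ports
    # are assumptions; check the output ports in your NeMo version)
    input_ids, input_type_ids, input_mask, output_ids, output_mask, labels = \
        data_layer()
    hidden_states = bert_model(input_ids=input_ids,
                               token_type_ids=input_type_ids,
                               attention_mask=input_mask)
    mlm_logits = mlm_classifier(hidden_states=hidden_states)
    mlm_loss = mlm_loss_fn(logits=mlm_logits,
                           output_ids=output_ids,
                           output_mask=output_mask)
    nsp_logits = nsp_classifier(hidden_states=hidden_states)
    nsp_loss = nsp_loss_fn(logits=nsp_logits, labels=labels)
    loss = bert_loss(loss_1=mlm_loss, loss_2=nsp_loss)
    return loss, [mlm_loss, nsp_loss], steps_per_epoch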
Example #3
def create_pipeline(num_samples=-1,
                    pad_label=args.none_label,
                    max_seq_length=args.max_seq_length,
                    batch_size=args.batch_size,
                    local_rank=args.local_rank,
                    num_gpus=args.num_gpus,
                    mode='train',
                    punct_label_ids=None,
                    capit_label_ids=None,
                    ignore_extra_tokens=args.ignore_extra_tokens,
                    ignore_start_end=args.ignore_start_end,
                    use_cache=args.use_cache,
                    dropout=args.fc_dropout,
                    punct_num_layers=args.punct_num_fc_layers):

    global punct_classifier, punct_loss, \
        capit_classifier, capit_loss, task_loss
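    # note: these globals are class-name strings at module level; the first
    # 'train' call resolves them with getattr below and rebinds the names to
    # the instantiated modules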

    nf.logger.info(f"Loading {mode} data...")
    shuffle = args.shuffle_data if mode == 'train' else False

    text_file = f'{args.data_dir}/text_{mode}.txt'
    label_file = f'{args.data_dir}/labels_{mode}.txt'

    if not (os.path.exists(text_file) and os.path.exists(label_file)):
        raise FileNotFoundError(
            f'{text_file} or {label_file} not found. The data should be '
            'split into 2 files: text.txt and labels.txt. Each line of the '
            'text.txt file contains text sequences, where words are '
            'separated with spaces. The labels.txt file contains the '
            'corresponding labels for each word in text.txt; the labels are '
            'separated with spaces. Each line of the files should follow '
            'the format: [WORD] [SPACE] [WORD] [SPACE] [WORD] (for '
            'text.txt) and [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] '
            '(for labels.txt).')

    data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer(
        tokenizer=tokenizer,
        text_file=text_file,
        label_file=label_file,
        pad_label=pad_label,
        punct_label_ids=punct_label_ids,
        capit_label_ids=capit_label_ids,
        max_seq_length=max_seq_length,
        batch_size=batch_size,
        num_workers=0,
        local_rank=local_rank,
        shuffle=shuffle,
        ignore_extra_tokens=ignore_extra_tokens,
        ignore_start_end=ignore_start_end,
        use_cache=use_cache)

    input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, \
        punct_labels, capit_labels = data_layer()

    if mode == 'train':
        punct_label_ids = data_layer.dataset.punct_label_ids
        capit_label_ids = data_layer.dataset.capit_label_ids
        class_weights = None

        if args.use_weighted_loss_punct:
            nf.logger.info(f"Using weighted loss for punctuation task")
            punct_label_freqs = data_layer.dataset.punct_label_frequencies
            class_weights = utils.calc_class_weights(punct_label_freqs)

        # Initialize punctuation loss
        punct_classifier = getattr(sys.modules[__name__], punct_classifier)
        punct_classifier = punct_classifier(hidden_size=hidden_size,
                                            num_classes=len(punct_label_ids),
                                            dropout=dropout,
                                            num_layers=punct_num_layers,
                                            name='Punctuation')

        punct_loss = getattr(sys.modules[__name__], punct_loss)
        punct_loss = punct_loss(num_classes=len(punct_label_ids),
                                class_weights=class_weights)

        # Initialize capitalization loss
        capit_classifier = getattr(sys.modules[__name__], capit_classifier)
        capit_classifier = capit_classifier(hidden_size=hidden_size,
                                            num_classes=len(capit_label_ids),
                                            dropout=dropout,
                                            name='Capitalization')
        capit_loss = getattr(sys.modules[__name__], capit_loss)
        capit_loss = capit_loss(num_classes=len(capit_label_ids))

        task_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)

    hidden_states = model(input_ids=input_ids,
                          token_type_ids=input_type_ids,
                          attention_mask=input_mask)

    punct_logits = punct_classifier(hidden_states=hidden_states)
    capit_logits = capit_classifier(hidden_states=hidden_states)

    if mode == 'train':
        punct_loss = punct_loss(logits=punct_logits,
                                labels=punct_labels,
                                loss_mask=loss_mask)
        capit_loss = capit_loss(logits=capit_logits,
                                labels=capit_labels,
                                loss_mask=loss_mask)
        task_loss = task_loss(loss_1=punct_loss, loss_2=capit_loss)

        steps_per_epoch = len(data_layer) // (batch_size * num_gpus)

        losses = [task_loss, punct_loss, capit_loss]
        logits = [punct_logits, capit_logits]
        return (losses, logits, steps_per_epoch, punct_label_ids,
                capit_label_ids)
    else:
        tensors_to_evaluate = [
            punct_logits, capit_logits, punct_labels, capit_labels,
            subtokens_mask
        ]
        return tensors_to_evaluate, data_layer
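
A hedged usage sketch for this function: the 'dev' mode string and the calling
convention below are assumptions, inferred only from the return statements
above.

# build the training pipeline first, then reuse its label maps for evaluation
(losses, train_logits, steps_per_epoch,
 punct_label_ids, capit_label_ids) = create_pipeline()

eval_tensors, eval_data_layer = create_pipeline(
    mode='dev',
    punct_label_ids=punct_label_ids,
    capit_label_ids=capit_label_ids)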