Example #1: pipeline builder supporting both raw-text and preprocessed data
def create_pipeline(data_file,
                    batch_size,
                    preprocessed_data=False,
                    batches_per_step=1,
                    **kwargs):

    if not preprocessed_data:
        # Raw text input: masked LM examples are built on the fly
        max_seq_length = kwargs['max_seq_length']
        mask_probability = kwargs['mask_probability']
        short_seq_prob = kwargs['short_seq_prob']
        data_layer = nemo_nlp.BertPretrainingDataLayer(tokenizer,
                                                       data_file,
                                                       max_seq_length,
                                                       mask_probability,
                                                       short_seq_prob,
                                                       batch_size=batch_size)
    else:
        # Preprocessed input: masking was already applied offline
        training = kwargs['training']
        max_predictions_per_seq = kwargs['max_predictions_per_seq']
        data_layer = nemo_nlp.BertPretrainingPreprocessedDataLayer(
            data_file,
            max_predictions_per_seq,
            batch_size=batch_size,
            training=training)

    # Optimizer steps per epoch, accounting for data-parallel GPUs and
    # gradient accumulation (batches_per_step)
    steps_per_epoch = math.ceil(
        len(data_layer) / (batch_size * args.num_gpus * batches_per_step))

    # Wire up the graph: data layer -> BERT encoder -> MLM head (+ optional NSP head)
    input_ids, input_type_ids, input_mask, \
        output_ids, output_mask, nsp_labels = data_layer()
    hidden_states = bert_model(input_ids=input_ids,
                               token_type_ids=input_type_ids,
                               attention_mask=input_mask)
    mlm_logits = mlm_classifier(hidden_states=hidden_states)
    mlm_loss = mlm_loss_fn(logits=mlm_logits,
                           output_ids=output_ids,
                           output_mask=output_mask)
    if not args.only_mlm_loss:
        nsp_logits = nsp_classifier(hidden_states=hidden_states)
        nsp_loss = nsp_loss_fn(logits=nsp_logits, labels=nsp_labels)
        loss = bert_loss(loss_1=mlm_loss, loss_2=nsp_loss)
    else:
        loss = mlm_loss
        nsp_loss = None
    return loss, mlm_loss, nsp_loss, steps_per_epoch
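
A minimal usage sketch (not part of the original example): calling the create_pipeline above once per data split, with raw text for training and an offline-preprocessed file for evaluation. The argument names on args (train_data, eval_data, batches_per_step, max_predictions_per_seq, ...) are assumptions for illustration only.

# Hypothetical invocation; every args.* name below is assumed, not from the source
train_loss, train_mlm_loss, train_nsp_loss, steps_per_epoch = create_pipeline(
    data_file=args.train_data,              # assumed: raw text corpus
    batch_size=args.batch_size,
    preprocessed_data=False,
    batches_per_step=args.batches_per_step,
    max_seq_length=args.max_seq_length,
    mask_probability=args.mask_probability,
    short_seq_prob=args.short_seq_prob)

eval_loss, eval_mlm_loss, eval_nsp_loss, _ = create_pipeline(
    data_file=args.eval_data,               # assumed: offline-preprocessed data
    batch_size=args.eval_batch_size,
    preprocessed_data=True,
    training=False,
    max_predictions_per_seq=args.max_predictions_per_seq)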
Example #2: simplified pipeline builder for raw-text data only
def create_pipeline(data_file, max_seq_length, mask_probability, batch_size):
    data_layer = nemo_nlp.BertPretrainingDataLayer(tokenizer,
                                                   data_file,
                                                   max_seq_length,
                                                   mask_probability,
                                                   batch_size=batch_size)
    steps_per_epoch = len(data_layer) // (batch_size * args.num_gpus)

    input_ids, input_type_ids, input_mask, \
        output_ids, output_mask, nsp_labels = data_layer()
    hidden_states = bert_model(input_ids=input_ids,
                               token_type_ids=input_type_ids,
                               attention_mask=input_mask)
    mlm_logits = mlm_classifier(hidden_states=hidden_states)
    mlm_loss = mlm_loss_fn(logits=mlm_logits,
                           output_ids=output_ids,
                           output_mask=output_mask)
    nsp_logits = nsp_classifier(hidden_states=hidden_states)
    nsp_loss = nsp_loss_fn(logits=nsp_logits, labels=nsp_labels)

    loss = bert_loss(loss_1=mlm_loss, loss_2=nsp_loss)
    return loss, [mlm_loss, nsp_loss], steps_per_epoch
Example #3: constructing the NSP and loss modules, weight tying, and the train/dev data layers
nsp_log_softmax = nemo_nlp.SentenceClassificationLogSoftmaxNM(
    d_model=args.d_model, num_classes=2, factory=neural_factory)
nsp_loss = nemo_nlp.NextSentencePredictionLossNM(factory=neural_factory)

bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2, factory=neural_factory)

# Tie the weights of the MLM softmax layer (mlm_log_softmax, created earlier
# in the script) to the word-embedding matrix of the encoder
mlm_log_softmax.log_softmax.dense.weight = \
    bert_model.bert.embeddings.word_embeddings.weight

train_data_layer = nemo_nlp.BertPretrainingDataLayer(
    tokenizer=tokenizer,
    dataset=args.dataset_dir,
    name="train",
    sentence_indices_filename=args.train_sentence_indices_filename,
    max_seq_length=args.max_sequence_length,
    mask_probability=args.mask_probability,
    batch_size=args.batch_size,
    factory=neural_factory)

dev_data_layer = nemo_nlp.BertPretrainingDataLayer(
    tokenizer=tokenizer,
    dataset=args.dev_dataset_dir,
    name="dev",
    sentence_indices_filename=args.dev_sentence_indices_filename,
    max_seq_length=args.max_sequence_length,
    mask_probability=args.mask_probability,
    batch_size=args.eval_batch_size,
    factory=neural_factory)
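
A short sketch (following the graph-construction pattern of Examples #1 and #2) of how these data layers could feed the encoder and the NSP head; the port names on nsp_log_softmax and nsp_loss are assumed to mirror those earlier examples, and mlm_log_softmax is taken to be defined earlier in the script.

# Unpack the tensors produced by the training data layer
input_ids, input_type_ids, input_mask, \
    output_ids, output_mask, nsp_labels = train_data_layer()

# Run the BERT encoder
hidden_states = bert_model(input_ids=input_ids,
                           token_type_ids=input_type_ids,
                           attention_mask=input_mask)

# NSP branch (port names assumed); the MLM branch via mlm_log_softmax and the
# final aggregation through bert_loss would follow the same pattern as above
nsp_logits = nsp_log_softmax(hidden_states=hidden_states)
train_nsp_loss = nsp_loss(logits=nsp_logits, labels=nsp_labels)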