    max_seq_length=args.max_sequence_length,
    hidden_act="gelu",
    factory=neural_factory)

# instantiate necessary modules for the whole BERT pretraining pipeline,
# namely data layers, BERT encoder, and MLM and NSP loss functions
mlm_log_softmax = nemo_nlp.TransformerLogSoftmaxNM(vocab_size=vocab_size,
                                                   d_model=args.d_model,
                                                   factory=neural_factory)
mlm_loss = nemo_nlp.MaskedLanguageModelingLossNM(factory=neural_factory)

nsp_log_softmax = nemo_nlp.SentenceClassificationLogSoftmaxNM(
    d_model=args.d_model,
    num_classes=2,
    factory=neural_factory)
nsp_loss = nemo_nlp.NextSentencePredictionLossNM(factory=neural_factory)

bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2, factory=neural_factory)

# tie weights of the MLM softmax layer and the embedding layer of the encoder
mlm_log_softmax.log_softmax.dense.weight = \
    bert_model.bert.embeddings.word_embeddings.weight

train_data_layer = nemo_nlp.BertPretrainingDataLayer(
    tokenizer=tokenizer,
    dataset=args.dataset_dir,
    name="train",
    sentence_indices_filename=args.train_sentence_indices_filename,
    max_seq_length=args.max_sequence_length,
    mask_probability=args.mask_probability,
    batch_size=args.batch_size,
    factory=neural_factory)
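# A minimal sketch (not part of the original snippet) of how these modules
# could be connected into the training graph: the data layer emits the input
# tensors, the shared BERT encoder produces hidden states, and the MLM and
# NSP losses are aggregated into one training loss.  The data layer output
# names (output_ids, output_mask, nsp_labels) and the loss keyword arguments
# are assumptions; check the module signatures in your NeMo release.
input_ids, input_type_ids, input_mask, \
    output_ids, output_mask, nsp_labels = train_data_layer()
hidden_states = bert_model(input_ids=input_ids,
                           token_type_ids=input_type_ids,
                           attention_mask=input_mask)
train_mlm_log_probs = mlm_log_softmax(hidden_states=hidden_states)
train_mlm_loss = mlm_loss(log_probs=train_mlm_log_probs,
                          output_ids=output_ids,
                          output_mask=output_mask)
train_nsp_log_probs = nsp_log_softmax(hidden_states=hidden_states)
train_nsp_loss = nsp_loss(log_probs=train_nsp_log_probs, labels=nsp_labels)
train_loss = bert_loss(loss_1=train_mlm_loss, loss_2=train_nsp_loss)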
""" create necessary modules for the whole translation pipeline, namely data layers, BERT encoder, and MLM and NSP loss functions """ mlm_classifier = nemo_nlp.TokenClassifier(args.d_model, num_classes=tokenizer.vocab_size, num_layers=1, log_softmax=True) mlm_loss_fn = nemo_nlp.MaskedLanguageModelingLossNM() nsp_classifier = nemo_nlp.SequenceClassifier(args.d_model, num_classes=2, num_layers=2, log_softmax=True) nsp_loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss() bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2) # tie weights of MLM softmax layer and embedding layer of the encoder mlm_classifier.mlp.last_linear_layer.weight = \ bert_model.bert.embeddings.word_embeddings.weight def create_pipeline(data_file, max_seq_length, mask_probability, short_seq_prob, batch_size): data_layer = nemo_nlp.BertPretrainingDataLayer(tokenizer, data_file, max_seq_length, mask_probability, short_seq_prob, batch_size=batch_size) steps_per_epoch = len(data_layer) // (batch_size * args.num_gpus)
def create_pipeline(num_samples=-1,
                    pad_label=args.none_label,
                    max_seq_length=args.max_seq_length,
                    batch_size=args.batch_size,
                    local_rank=args.local_rank,
                    num_gpus=args.num_gpus,
                    mode='train',
                    punct_label_ids=None,
                    capit_label_ids=None,
                    ignore_extra_tokens=args.ignore_extra_tokens,
                    ignore_start_end=args.ignore_start_end,
                    use_cache=args.use_cache,
                    dropout=args.fc_dropout,
                    punct_num_layers=args.punct_num_fc_layers):

    global punct_classifier, punct_loss, \
        capit_classifier, capit_loss, task_loss

    nf.logger.info(f"Loading {mode} data...")
    shuffle = args.shuffle_data if mode == 'train' else False

    text_file = f'{args.data_dir}/text_{mode}.txt'
    label_file = f'{args.data_dir}/labels_{mode}.txt'

    if not (os.path.exists(text_file) and os.path.exists(label_file)):
        raise FileNotFoundError(
            f'{text_file} or {label_file} not found. The data should be '
            'split into 2 files: text.txt and labels.txt. Each line of the '
            'text.txt file contains text sequences, where words are '
            'separated with spaces. The labels.txt file contains '
            'corresponding labels for each word in text.txt; the labels are '
            'separated with spaces. Each line of the files should follow '
            'the format: '
            '[WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and '
            '[LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).')

    data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer(
        tokenizer=tokenizer,
        text_file=text_file,
        label_file=label_file,
        pad_label=pad_label,
        punct_label_ids=punct_label_ids,
        capit_label_ids=capit_label_ids,
        max_seq_length=max_seq_length,
        batch_size=batch_size,
        num_workers=0,
        local_rank=local_rank,
        shuffle=shuffle,
        ignore_extra_tokens=ignore_extra_tokens,
        ignore_start_end=ignore_start_end,
        use_cache=use_cache)

    input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, \
        punct_labels, capit_labels = data_layer()

    if mode == 'train':
        punct_label_ids = data_layer.dataset.punct_label_ids
        capit_label_ids = data_layer.dataset.capit_label_ids

        class_weights = None
        if args.use_weighted_loss_punct:
            nf.logger.info("Using weighted loss for punctuation task")
            punct_label_freqs = data_layer.dataset.punct_label_frequencies
            class_weights = utils.calc_class_weights(punct_label_freqs)

        # Initialize punctuation classifier and loss
        # (the globals hold the class names as strings until instantiated here)
        punct_classifier = getattr(sys.modules[__name__], punct_classifier)
        punct_classifier = punct_classifier(hidden_size=hidden_size,
                                            num_classes=len(punct_label_ids),
                                            dropout=dropout,
                                            num_layers=punct_num_layers,
                                            name='Punctuation')

        punct_loss = getattr(sys.modules[__name__], punct_loss)
        punct_loss = punct_loss(num_classes=len(punct_label_ids),
                                class_weights=class_weights)

        # Initialize capitalization classifier and loss
        capit_classifier = getattr(sys.modules[__name__], capit_classifier)
        capit_classifier = capit_classifier(hidden_size=hidden_size,
                                            num_classes=len(capit_label_ids),
                                            dropout=dropout,
                                            name='Capitalization')

        capit_loss = getattr(sys.modules[__name__], capit_loss)
        capit_loss = capit_loss(num_classes=len(capit_label_ids))

        task_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)

    hidden_states = model(input_ids=input_ids,
                          token_type_ids=input_type_ids,
                          attention_mask=input_mask)

    punct_logits = punct_classifier(hidden_states=hidden_states)
    capit_logits = capit_classifier(hidden_states=hidden_states)

    if mode == 'train':
        punct_loss = punct_loss(logits=punct_logits,
                                labels=punct_labels,
                                loss_mask=loss_mask)
        capit_loss = capit_loss(logits=capit_logits,
                                labels=capit_labels,
                                loss_mask=loss_mask)
        task_loss = task_loss(loss_1=punct_loss, loss_2=capit_loss)

        steps_per_epoch = len(data_layer) // (batch_size * num_gpus)

        losses = [task_loss, punct_loss, capit_loss]
        logits = [punct_logits, capit_logits]
        return (losses, logits, steps_per_epoch,
                punct_label_ids, capit_label_ids)
    else:
        tensors_to_evaluate = [
            punct_logits, capit_logits, punct_labels, capit_labels,
            subtokens_mask
        ]
        return tensors_to_evaluate, data_layer
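# A hypothetical usage sketch (not from the original script): build the
# training pipeline first so that the label-to-id maps learned from the
# training set can be reused for the evaluation pipeline.  The 'dev' mode
# (and the text_dev.txt / labels_dev.txt files it implies) is an assumption.
(losses, train_logits, steps_per_epoch,
 punct_label_ids, capit_label_ids) = create_pipeline()

eval_tensors, eval_data_layer = create_pipeline(
    mode='dev',
    punct_label_ids=punct_label_ids,
    capit_label_ids=capit_label_ids)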