def create_pipeline(num_samples=-1,
                    pad_label=args.none_label,
                    max_seq_length=args.max_seq_length,
                    batch_size=args.batch_size,
                    local_rank=args.local_rank,
                    num_gpus=args.num_gpus,
                    mode='train',
                    ignore_extra_tokens=args.ignore_extra_tokens,
                    ignore_start_end=args.ignore_start_end,
                    use_cache=args.use_cache):
    """Build the token-classification pipeline for one data split.

    Wires data layer -> BERT encoder -> classifier -> loss for the given
    ``mode`` ('train' or an eval split). Expects ``{data_dir}/text_{mode}.txt``
    and ``{data_dir}/labels_{mode}.txt`` to both exist.

    Args:
        num_samples: kept for interface compatibility; currently unused
            by this function (the data layer reads the whole file).
        pad_label: label used for padding positions.
        max_seq_length: maximum subtoken sequence length.
        batch_size: per-GPU batch size.
        local_rank: distributed local rank (None for single-process).
        num_gpus: number of GPUs, used to compute steps per epoch.
        mode: data split name; shuffling is enabled only for 'train'
            (and only if ``args.shuffle_data`` is set).
        ignore_extra_tokens: passed through to the data layer.
        ignore_start_end: passed through to the data layer.
        use_cache: passed through to the data layer (feature caching).

    Returns:
        Tuple of (tensors_to_evaluate, loss, steps_per_epoch, label_ids,
        data_layer). ``tensors_to_evaluate`` is [loss, logits] in train
        mode and [logits, labels, subtokens_mask] otherwise.

    Raises:
        FileNotFoundError: if either the text file or the label file
            is missing.
    """
    nf.logger.info(f"Loading {mode} data...")
    shuffle = args.shuffle_data if mode == 'train' else False

    text_file = f'{args.data_dir}/text_{mode}.txt'
    label_file = f'{args.data_dir}/labels_{mode}.txt'

    # BUG FIX: the original condition used `or`, so the error was raised
    # only when BOTH files were missing; a single missing file slipped
    # through and failed later inside the data layer. Both files are
    # required, so raise if EITHER one is absent.
    if not (os.path.exists(text_file) and os.path.exists(label_file)):
        raise FileNotFoundError(
            f'{text_file} or {label_file} not found. \
The data should be splitted into 2 files: text.txt and labels.txt. \
Each line of the text.txt file contains text sequences, where words \
are separated with spaces. The labels.txt file contains \
corresponding labels for each word in text.txt, the labels are \
separated with spaces. Each line of the files should follow the \
format: \
[WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and \
[LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).')

    data_layer = nemo_nlp.BertTokenClassificationDataLayer(
        tokenizer=tokenizer,
        text_file=text_file,
        label_file=label_file,
        pad_label=pad_label,
        max_seq_length=max_seq_length,
        batch_size=batch_size,
        num_workers=0,
        local_rank=local_rank,
        shuffle=shuffle,
        ignore_extra_tokens=ignore_extra_tokens,
        ignore_start_end=ignore_start_end,
        use_cache=use_cache)

    label_ids = data_layer.dataset.label_ids
    input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, \
        labels = data_layer()

    hidden_states = bert_model(input_ids=input_ids,
                               token_type_ids=input_type_ids,
                               attention_mask=input_mask)
    logits = classifier(hidden_states=hidden_states)
    loss = punct_loss(logits=logits, labels=labels, loss_mask=loss_mask)

    # One optimizer step consumes batch_size samples on each of num_gpus GPUs.
    steps_per_epoch = len(data_layer) // (batch_size * num_gpus)

    if mode == 'train':
        tensors_to_evaluate = [loss, logits]
    else:
        tensors_to_evaluate = [logits, labels, subtokens_mask]

    return tensors_to_evaluate, loss, steps_per_epoch, label_ids, data_layer
def create_pipeline(input_file,
                    max_seq_length=args.max_seq_length,
                    batch_size=args.batch_size,
                    local_rank=args.local_rank,
                    num_gpus=args.num_gpus):
    """Assemble the NER pipeline from a single input file.

    Connects data layer -> pretrained BERT -> NER classifier -> loss.

    Args:
        input_file: path to the dataset file consumed by the data layer.
        max_seq_length: maximum subtoken sequence length.
        batch_size: per-GPU batch size.
        local_rank: distributed local rank (None for single-process).
        num_gpus: number of GPUs, used to compute steps per epoch.

    Returns:
        Tuple of (loss, steps_per_epoch, tag_ids, data_layer,
        [logits, seq_ids]).
    """
    layer = nemo_nlp.BertTokenClassificationDataLayer(
        tokenizer=tokenizer,
        input_file=input_file,
        max_seq_length=max_seq_length,
        dataset_type=args.dataset_type,
        batch_size=batch_size,
        num_workers=0,
        local_rank=local_rank)

    ids, type_ids, mask, labels, seq_ids = layer()

    hidden = pretrained_bert_model(input_ids=ids,
                                   token_type_ids=type_ids,
                                   attention_mask=mask)
    logits = ner_classifier(hidden_states=hidden)
    loss = ner_loss(logits=logits, labels=labels, input_mask=mask)

    # One optimizer step consumes batch_size samples on each of num_gpus GPUs.
    steps = len(layer) // (batch_size * num_gpus)

    return loss, steps, layer.dataset.tag_ids, layer, [logits, seq_ids]