Example #1
import os

import nemo_nlp  # NeMo 0.x NLP collection; provides BertTokenClassificationDataLayer

# `args`, `nf`, `tokenizer`, `bert_model`, `classifier`, and `punct_loss`
# are defined earlier in the surrounding training script.
def create_pipeline(num_samples=-1,
                    pad_label=args.none_label,
                    max_seq_length=args.max_seq_length,
                    batch_size=args.batch_size,
                    local_rank=args.local_rank,
                    num_gpus=args.num_gpus,
                    mode='train',
                    ignore_extra_tokens=args.ignore_extra_tokens,
                    ignore_start_end=args.ignore_start_end,
                    use_cache=args.use_cache):

    nf.logger.info(f"Loading {mode} data...")
    shuffle = args.shuffle_data if mode == 'train' else False

    text_file = f'{args.data_dir}/text_{mode}.txt'
    label_file = f'{args.data_dir}/labels_{mode}.txt'

    if not (os.path.exists(text_file) and os.path.exists(label_file)):
        raise FileNotFoundError(
            f'{text_file} or {label_file} not found. The data should be '
            'split into two files: text.txt and labels.txt. Each line of '
            'text.txt contains text sequences, where words are separated '
            'with spaces. labels.txt contains the corresponding label for '
            'each word in text.txt; the labels are separated with spaces. '
            'Each line of the files should follow the format: '
            '[WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and '
            '[LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).')

    data_layer = nemo_nlp.BertTokenClassificationDataLayer(
        tokenizer=tokenizer,
        text_file=text_file,
        label_file=label_file,
        pad_label=pad_label,
        max_seq_length=max_seq_length,
        batch_size=batch_size,
        num_workers=0,
        local_rank=local_rank,
        shuffle=shuffle,
        ignore_extra_tokens=ignore_extra_tokens,
        ignore_start_end=ignore_start_end,
        use_cache=use_cache)

    label_ids = data_layer.dataset.label_ids
    input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, \
        labels = data_layer()
    # Encode the batch with BERT, then score each token and compute the loss.
    hidden_states = bert_model(input_ids=input_ids,
                               token_type_ids=input_type_ids,
                               attention_mask=input_mask)

    logits = classifier(hidden_states=hidden_states)
    loss = punct_loss(logits=logits, labels=labels, loss_mask=loss_mask)
    steps_per_epoch = len(data_layer) // (batch_size * num_gpus)

    if mode == 'train':
        tensors_to_evaluate = [loss, logits]
    else:
        tensors_to_evaluate = [logits, labels, subtokens_mask]
    return tensors_to_evaluate, loss, steps_per_epoch, label_ids, data_layer
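
For reference, a minimal usage sketch of how this factory is typically wired up in the surrounding NeMo 0.x script; the 'dev' mode string and the discarded return values are assumptions, not part of the snippet above.

# Hypothetical usage; assumes a 'dev' split exists next to 'train'.
train_tensors, train_loss, steps_per_epoch, label_ids, _ = \
    create_pipeline(mode='train')
eval_tensors, _, _, _, eval_data_layer = create_pipeline(mode='dev')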
Example #2
import nemo_nlp  # NeMo 0.x NLP collection; provides BertTokenClassificationDataLayer

# `args`, `tokenizer`, `pretrained_bert_model`, `ner_classifier`, and
# `ner_loss` are defined earlier in the surrounding training script.
def create_pipeline(input_file,
                    max_seq_length=args.max_seq_length,
                    batch_size=args.batch_size,
                    local_rank=args.local_rank,
                    num_gpus=args.num_gpus):
    data_layer = nemo_nlp.BertTokenClassificationDataLayer(
        tokenizer=tokenizer,
        input_file=input_file,
        max_seq_length=max_seq_length,
        dataset_type=args.dataset_type,
        batch_size=batch_size,
        num_workers=0,
        local_rank=local_rank)
    tag_ids = data_layer.dataset.tag_ids
    input_ids, input_type_ids, input_mask, labels, seq_ids = data_layer()
    # Encode the batch with BERT, then score each token and compute the loss.
    hidden_states = pretrained_bert_model(input_ids=input_ids,
                                          token_type_ids=input_type_ids,
                                          attention_mask=input_mask)
    logits = ner_classifier(hidden_states=hidden_states)
    loss = ner_loss(logits=logits, labels=labels, input_mask=input_mask)
    steps_per_epoch = len(data_layer) // (batch_size * num_gpus)
    return loss, steps_per_epoch, tag_ids, data_layer, [logits, seq_ids]
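
Similarly, a minimal usage sketch for this NER variant; the train/dev file names are placeholders, not part of the snippet above.

# Hypothetical usage; file paths are placeholders.
train_loss, steps_per_epoch, tag_ids, train_data_layer, _ = \
    create_pipeline(f'{args.data_dir}/train.txt')
_, _, _, eval_data_layer, eval_tensors = \
    create_pipeline(f'{args.data_dir}/dev.txt')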