bert_model = nemo_nlp.nm.trainables.huggingface.BERT(
    pretrained_model_name=PRETRAINED_BERT_MODEL)

train_data_layer = nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(
    tokenizer=tokenizer,
    text_file=os.path.join(DATA_DIR, 'text_train.txt'),
    label_file=os.path.join(DATA_DIR, 'labels_train.txt'),
    max_seq_length=MAX_SEQ_LENGTH,
    batch_size=BATCH_SIZE)
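
# For reference, the data layer expects two parallel files: each line of
# text_train.txt holds lowercased, punctuation-free words, and the matching
# line of labels_train.txt holds one two-symbol label per word (the
# punctuation mark that should follow the word, then U/O for capitalized or
# not), as described in the NeMo punctuation tutorial. An illustrative
# sample (not taken from the actual data):
#
#   text_train.txt:   when is the next flight to new york
#   labels_train.txt: OU OO OO OO OO OO OU ?U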

punct_label_ids = train_data_layer.dataset.punct_label_ids
capit_label_ids = train_data_layer.dataset.capit_label_ids

# Define classifiers for the Punctuation and Capitalization tasks
punct_classifier = TokenClassifier(hidden_size=bert_model.hidden_size,
                                   num_classes=len(punct_label_ids),
                                   dropout=CLASSIFICATION_DROPOUT,
                                   num_layers=PUNCT_NUM_FC_LAYERS,
                                   name='Punctuation')

capit_classifier = TokenClassifier(hidden_size=bert_model.hidden_size,
                                   num_classes=len(capit_label_ids),
                                   dropout=CLASSIFICATION_DROPOUT,
                                   name='Capitalization')

# If you don't want a weighted loss for the Punctuation task, set class_weights=None
punct_label_freqs = train_data_layer.dataset.punct_label_frequencies
class_weights = calc_class_weights(punct_label_freqs)

# Define the loss functions
punct_loss = CrossEntropyLossNM(logits_ndim=3, weight=class_weights)
capit_loss = CrossEntropyLossNM(logits_ndim=3)
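
# A minimal sketch of how these modules are typically chained into a training
# graph in NeMo 0.x; the port order follows the punctuation tutorial, and the
# LossAggregatorNM import path is an assumption:
from nemo.backends.pytorch.common.losses import LossAggregatorNM

input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, \
    punct_labels, capit_labels = train_data_layer()

hidden_states = bert_model(input_ids=input_ids,
                           token_type_ids=input_type_ids,
                           attention_mask=input_mask)

punct_logits = punct_classifier(hidden_states=hidden_states)
capit_logits = capit_classifier(hidden_states=hidden_states)

punct_loss_tensor = punct_loss(logits=punct_logits,
                               labels=punct_labels,
                               loss_mask=loss_mask)
capit_loss_tensor = capit_loss(logits=capit_logits,
                               labels=capit_labels,
                               loss_mask=loss_mask)

# The aggregated loss is the tensor you would hand to nf.train.
task_loss = LossAggregatorNM(num_inputs=2)(loss_1=punct_loss_tensor,
                                           loss_2=capit_loss_tensor)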
Example #2
""" Load the pretrained BERT parameters.
To see the list of pretrained models, call:
nemo_nlp.huggingface.BERT.list_pretrained_models()
"""
pretrained_bert_model = nemo_nlp.nm.trainables.huggingface.BERT(
    pretrained_model_name=args.pretrained_bert_model)
hidden_size = pretrained_bert_model.hidden_size
tokenizer = NemoBertTokenizer(args.pretrained_bert_model)

data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationInferDataLayer(
    queries=args.queries,
    tokenizer=tokenizer,
    max_seq_length=args.max_seq_length,
    batch_size=1)

classifier = TokenClassifier(hidden_size=hidden_size,
                             num_classes=len(labels_dict),
                             dropout=args.fc_dropout)
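
# Both inference examples assume a labels_dict mapping integer label ids to
# label strings; the commented-out get_vocab call in Example #4 hints at how
# it is built. A minimal stand-in, assuming one label per line in the file:
def load_labels_dict(label_file):
    """Read a label vocabulary file into an {id: label} dict."""
    with open(label_file) as f:
        return {i: line.strip() for i, line in enumerate(f) if line.strip()}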

input_ids, input_type_ids, input_mask, _, subtokens_mask = data_layer()

hidden_states = pretrained_bert_model(input_ids=input_ids,
                                      token_type_ids=input_type_ids,
                                      attention_mask=input_mask)
logits = classifier(hidden_states=hidden_states)

###########################################################################

# Run the `infer` action to evaluate the logits and subtoken masks
evaluated_tensors = nf.infer(tensors=[logits, subtokens_mask],
                             checkpoint_dir=args.work_dir)
Example #3
pretrained_bert_model = nemo_nlp.nm.trainables.get_huggingface_model(
    pretrained_model_name=args.pretrained_model_name)

tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer(
    tokenizer_name=args.tokenizer,
    pretrained_model_name=args.pretrained_model_name,
    tokenizer_model=args.tokenizer_model,
)
hidden_size = pretrained_bert_model.hidden_size

data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationInferDataLayer(
    queries=args.queries,
    tokenizer=tokenizer,
    max_seq_length=args.max_seq_length,
    batch_size=1)

classifier = TokenClassifier(hidden_size=hidden_size,
                             num_classes=len(labels_dict))

input_ids, input_type_ids, input_mask, _, subtokens_mask = data_layer()

hidden_states = pretrained_bert_model(input_ids=input_ids,
                                      token_type_ids=input_type_ids,
                                      attention_mask=input_mask)
logits = classifier(hidden_states=hidden_states)

###########################################################################

# Run the `infer` action to evaluate the logits and subtoken masks
evaluated_tensors = nf.infer(tensors=[logits, subtokens_mask],
                             checkpoint_dir=args.checkpoint_dir)

Example #4
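
# label_notes below relies on module-level globals (nf, tokenizer,
# hidden_size, labels_dict, pretrained_bert_model) and on a concatenate
# helper. A sketch of that helper, assuming per-batch torch tensors as
# returned by nf.infer:
import numpy as np

def concatenate(lists):
    # Stitch the per-batch tensors into one numpy array along the batch axis.
    return np.concatenate([tensor.cpu().numpy() for tensor in lists])
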
def label_notes(all_notes_lines):
    #    nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, log_dir=None)
    #note_line_queries = notes.split('\n')
    #note_line_queries = ['pt arrived obtunded not answering questions responding to voice and sternal rub speaking in garbled voice pupils unequal left 3mm and right 2mm brisk bilaterally trauma sicu MD aware currently recieving keppra IV finished dilantin gtt due for level at 08a EEG today LSCTA on 3LNC sats 100 % SBP 90 s to 100 s HR NSR no ectopy 60 s NS @ 75cc continuous +BS no stools rec d lactulose at OSH to recieve PR q4h abd soft non-tender non-distended foley in place draining adequate amt clear yellow urine skin intact left 20G x2 WNL wife Name  NI']

    #    labels_dict = get_vocab(LABELS_DICT)
    """ Load the pretrained BERT parameters
    See the list of pretrained models, call:
    nemo_nlp.huggingface.BERT.list_pretrained_models()
    """
    #    pretrained_bert_model = nemo_nlp.nm.trainables.get_huggingface_model(
    #        bert_config=BERT_CONFIG, pretrained_model_name=PRETRAINED_MODEL_NAME
    #    )

    #    tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer(
    #        tokenizer_name=TOKENIZER,
    #        pretrained_model_name=PRETRAINED_MODEL_NAME,
    #        tokenizer_model=TOKENIZER_MODEL,
    #    )
    #    hidden_size = pretrained_bert_model.hidden_size

    load_datalayer_begin_time = time.time()
    data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationInferDataLayer(
        queries=all_notes_lines,
        tokenizer=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        batch_size=2000)
    load_datalayer_end_time = time.time()

    classifier = TokenClassifier(hidden_size=hidden_size,
                                 num_classes=len(labels_dict))

    input_ids, input_type_ids, input_mask, _, subtokens_mask = data_layer()

    load_hidden_states_begin_time = time.time()
    hidden_states = pretrained_bert_model(input_ids=input_ids,
                                          token_type_ids=input_type_ids,
                                          attention_mask=input_mask)
    load_hidden_states_end_time = time.time()
    load_logits_begin_time = time.time()
    logits = classifier(hidden_states=hidden_states)
    load_logits_end_time = time.time()

    ###########################################################################

    # Run the `infer` action to evaluate the logits and subtoken masks
    infer_begin_time = time.time()
    evaluated_tensors = nf.infer(tensors=[logits, subtokens_mask],
                                 checkpoint_dir=CHECKPOINT_DIR)
    infer_end_time = time.time()

    logits, subtokens_mask = [
        concatenate(tensors) for tensors in evaluated_tensors
    ]

    preds = np.argmax(logits, axis=2)
    all_notes_labeled_lines = []

    for i, query in enumerate(all_notes_lines):
        logging.info(f'Query: {query}')

        pred = preds[i][subtokens_mask[i] > 0.5]
        words = query.strip().split()

        # Instead of raising an error when prediction and word counts differ,
        # the logic below keeps the original line and logs the mismatch.

        if len(pred) == len(words):
            output = ''
            for j, w in enumerate(words):
                output += w
                label = labels_dict[pred[j]]
                if label != NONE_LABEL:
                    label = add_brackets(label)
                    output += label
                output += ' '
            all_notes_labeled_lines.append(output.strip())
            logging.info(f'Combined: {output.strip()}')
        else:
            all_notes_labeled_lines.append(query)
            logging.info(
                f'__Prediction/Word Length Mismatch__ '
                f'pred length: {len(pred)}, words length: {len(words)}'
            )
            logging.info(f'{query}')

    print(f'{load_datalayer_end_time - load_datalayer_begin_time} '
          'seconds to load the datalayer')
    print(f'{load_hidden_states_end_time - load_hidden_states_begin_time} '
          'seconds to build hidden states')
    print(f'{load_logits_end_time - load_logits_begin_time} '
          'seconds to build logits')
    print(f'{infer_end_time - infer_begin_time} seconds to run inference')

    return all_notes_labeled_lines
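
# Hypothetical driver for label_notes; the file name is a placeholder.
if __name__ == '__main__':
    with open('notes.txt') as f:
        notes_lines = [line.strip() for line in f if line.strip()]
    for labeled_line in label_notes(notes_lines):
        print(labeled_line)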