punct_classifier = TokenClassifier(hidden_size=bert_model.hidden_size,
                                   num_classes=len(punct_label_ids),
                                   dropout=CLASSIFICATION_DROPOUT,
                                   num_layers=PUNCT_NUM_FC_LAYERS,
                                   name='Punctuation')

capit_classifier = TokenClassifier(hidden_size=bert_model.hidden_size,
                                   num_classes=len(capit_label_ids),
                                   dropout=CLASSIFICATION_DROPOUT,
                                   name='Capitalization')

# If you don't want to use weighted loss for Punctuation task, use class_weights=None
punct_label_freqs = train_data_layer.dataset.punct_label_frequencies
class_weights = calc_class_weights(punct_label_freqs)
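# A minimal sketch of inverse-frequency class weighting, shown for
# illustration only: this is an assumption about the general idea, not
# necessarily what NeMo's calc_class_weights helper computes. It assumes
# label_freqs is a list of per-label counts.
def inverse_frequency_weights(label_freqs):
    total = sum(label_freqs)
    # Rare labels receive proportionally larger weights.
    return [total / (len(label_freqs) * max(count, 1)) for count in label_freqs]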

# define loss
punct_loss = CrossEntropyLossNM(logits_ndim=3, weight=class_weights)
capit_loss = CrossEntropyLossNM(logits_ndim=3)
task_loss = LossAggregatorNM(num_inputs=2)

(input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask,
 punct_labels, capit_labels) = train_data_layer()

hidden_states = bert_model(input_ids=input_ids,
                           token_type_ids=input_type_ids,
                           attention_mask=input_mask)

punct_logits = punct_classifier(hidden_states=hidden_states)
capit_logits = capit_classifier(hidden_states=hidden_states)

punct_loss = punct_loss(logits=punct_logits,
                        labels=punct_labels,
                        loss_mask=loss_mask)
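# The capitalization and aggregated task losses are wired analogously in the
# complete example (compare Example #2 below); hypothetical sketch:
# capit_loss = capit_loss(logits=capit_logits, labels=capit_labels,
#                         loss_mask=loss_mask)
# task_loss = task_loss(loss_1=punct_loss, loss_2=capit_loss)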
Example #2
def create_pipeline(
    pad_label=args.none_label,
    max_seq_length=args.max_seq_length,
    batch_size=args.batch_size,
    num_gpus=args.num_gpus,
    mode='train',
    punct_label_ids=None,
    capit_label_ids=None,
    ignore_extra_tokens=args.ignore_extra_tokens,
    ignore_start_end=args.ignore_start_end,
    overwrite_processed_files=args.overwrite_processed_files,
    dropout=args.fc_dropout,
    punct_num_layers=args.punct_num_fc_layers,
    capit_num_layers=args.capit_num_fc_layers,
    classifier=PunctCapitTokenClassifier,
):

    logging.info(f"Loading {mode} data...")
    shuffle = args.shuffle_data if mode == 'train' else False

    text_file = f'{args.data_dir}/text_{mode}.txt'
    label_file = f'{args.data_dir}/labels_{mode}.txt'

    if not (os.path.exists(text_file) and os.path.exists(label_file)):
        raise FileNotFoundError(
            f'{text_file} or {label_file} not found. The data should be split '
            'into 2 files: text.txt and labels.txt. Each line of the text.txt '
            'file contains text sequences, where words are separated with '
            'spaces. The labels.txt file contains corresponding labels for '
            'each word in text.txt; the labels are separated with spaces. '
            'Each line of the files should follow the format: '
            '[WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and '
            '[LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).')

    data_layer = PunctuationCapitalizationDataLayer(
        tokenizer=tokenizer,
        text_file=text_file,
        label_file=label_file,
        pad_label=pad_label,
        punct_label_ids=punct_label_ids,
        capit_label_ids=capit_label_ids,
        max_seq_length=max_seq_length,
        batch_size=batch_size,
        shuffle=shuffle,
        ignore_extra_tokens=ignore_extra_tokens,
        ignore_start_end=ignore_start_end,
        overwrite_processed_files=overwrite_processed_files,
        num_workers=args.num_workers,
        pin_memory=args.enable_pin_memory,
    )

    (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask,
     punct_labels, capit_labels) = data_layer()

    if mode == 'train':
        punct_label_ids = data_layer.dataset.punct_label_ids
        capit_label_ids = data_layer.dataset.capit_label_ids
        class_weights = None

        if args.use_weighted_loss_punct:
            logging.info(f"Using weighted loss for punctuation task")
            punct_label_freqs = data_layer.dataset.punct_label_frequencies
            class_weights = calc_class_weights(punct_label_freqs)

        classifier = classifier(
            hidden_size=hidden_size,
            punct_num_classes=len(punct_label_ids),
            capit_num_classes=len(capit_label_ids),
            dropout=dropout,
            punct_num_layers=punct_num_layers,
            capit_num_layers=capit_num_layers,
        )

        punct_loss = CrossEntropyLossNM(logits_ndim=3, weight=class_weights)
        capit_loss = CrossEntropyLossNM(logits_ndim=3)
        task_loss = LossAggregatorNM(
            num_inputs=2,
            weights=[args.punct_loss_weight, 1.0 - args.punct_loss_weight])

    hidden_states = model(input_ids=input_ids,
                          token_type_ids=input_type_ids,
                          attention_mask=input_mask)

    punct_logits, capit_logits = classifier(hidden_states=hidden_states)

    if mode == 'train':
        punct_loss = punct_loss(logits=punct_logits,
                                labels=punct_labels,
                                loss_mask=loss_mask)
        capit_loss = capit_loss(logits=capit_logits,
                                labels=capit_labels,
                                loss_mask=loss_mask)
        task_loss = task_loss(loss_1=punct_loss, loss_2=capit_loss)

        steps_per_epoch = len(data_layer) // (batch_size * num_gpus)

        losses = [task_loss, punct_loss, capit_loss]
        logits = [punct_logits, capit_logits]
        return losses, logits, steps_per_epoch, punct_label_ids, capit_label_ids, classifier
    else:
        tensors_to_evaluate = [
            punct_logits, capit_logits, punct_labels, capit_labels,
            subtokens_mask
        ]
        return tensors_to_evaluate, data_layer
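# Usage sketch, for illustration only. It assumes text_dev.txt/labels_dev.txt
# exist in args.data_dir and that the label ids and classifier produced by the
# training pipeline are reused for evaluation.
(train_losses, train_logits, steps_per_epoch, punct_label_ids,
 capit_label_ids, classifier) = create_pipeline(mode='train')
eval_tensors, eval_data_layer = create_pipeline(
    mode='dev',
    punct_label_ids=punct_label_ids,
    capit_label_ids=capit_label_ids,
    classifier=classifier,
)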
Example #3
def create_pipeline(
    pad_label=args.none_label,
    max_seq_length=args.max_seq_length,
    batch_size=args.batch_size,
    num_gpus=args.num_gpus,
    mode='train',
    batches_per_step=args.batches_per_step,
    label_ids=None,
    ignore_extra_tokens=args.ignore_extra_tokens,
    ignore_start_end=args.ignore_start_end,
    use_cache=args.use_cache,
    dropout=args.fc_dropout,
    num_layers=args.num_fc_layers,
    classifier=TokenClassifier,
):

    logging.info(f"Loading {mode} data...")
    shuffle = args.shuffle_data if mode == 'train' else False

    text_file = f'{args.data_dir}/text_{mode}.txt'
    label_file = f'{args.data_dir}/labels_{mode}.txt'

    if not (os.path.exists(text_file) and os.path.exists(label_file)):
        raise FileNotFoundError(
            f'{text_file} or {label_file} not found. The data should be split '
            'into 2 files: text.txt and labels.txt. Each line of the text.txt '
            'file contains text sequences, where words are separated with '
            'spaces. The labels.txt file contains corresponding labels for '
            'each word in text.txt; the labels are separated with spaces. '
            'Each line of the files should follow the format: '
            '[WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and '
            '[LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).')

    data_layer = BertTokenClassificationDataLayer(
        tokenizer=tokenizer,
        text_file=text_file,
        label_file=label_file,
        pad_label=pad_label,
        label_ids=label_ids,
        max_seq_length=max_seq_length,
        batch_size=batch_size,
        shuffle=shuffle,
        ignore_extra_tokens=ignore_extra_tokens,
        ignore_start_end=ignore_start_end,
        use_cache=use_cache,
    )

    (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask,
     labels) = data_layer()

    if mode == 'train':
        label_ids = data_layer.dataset.label_ids
        class_weights = None

        if args.use_weighted_loss:
            logging.info(f"Using weighted loss")
            label_freqs = data_layer.dataset.label_frequencies
            class_weights = calc_class_weights(label_freqs)

        classifier = classifier(hidden_size=hidden_size,
                                num_classes=len(label_ids),
                                dropout=dropout,
                                num_layers=num_layers)

        task_loss = CrossEntropyLossNM(logits_ndim=3, weight=class_weights)

    hidden_states = model(input_ids=input_ids,
                          token_type_ids=input_type_ids,
                          attention_mask=input_mask)
    logits = classifier(hidden_states=hidden_states)

    if mode == 'train':
        loss = task_loss(logits=logits, labels=labels, loss_mask=loss_mask)
        steps_per_epoch = len(data_layer) // (batch_size * num_gpus *
                                              batches_per_step)
        tensors_to_evaluate = [loss, logits]
        return tensors_to_evaluate, loss, steps_per_epoch, label_ids, classifier
    else:
        tensors_to_evaluate = [logits, labels, subtokens_mask]
        return tensors_to_evaluate, data_layer
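# Usage sketch, for illustration only; `nf` is assumed to be a
# nemo.core.NeuralModuleFactory created earlier in the script, and the
# optimization parameters shown here are placeholders.
(train_tensors, train_loss, steps_per_epoch, label_ids,
 classifier) = create_pipeline(mode='train')
train_callback = nemo.core.SimpleLossLoggerCallback(
    tensors=train_tensors,
    step_freq=100,
    print_func=lambda x: logging.info(f'Train loss: {str(x[0].item())}'),
)
nf.train(
    [train_loss],
    callbacks=[train_callback],
    optimization_params={"num_epochs": 5, "lr": 5e-5},
    optimizer="adam",
)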
Example #4
vocab_size = len(data_desc.vocab)
encoder = EncoderRNN(vocab_size, args.emb_dim, args.hid_dim, args.dropout,
                     args.n_layers)

decoder = TRADEGenerator(
    data_desc.vocab,
    encoder.embedding,
    args.hid_dim,
    args.dropout,
    data_desc.slots,
    len(data_desc.gating_dict),
    teacher_forcing=args.teacher_forcing,
)

gate_loss_fn = CrossEntropyLossNM(logits_ndim=3)
ptr_loss_fn = MaskedLogLoss()
total_loss_fn = LossAggregatorNM(num_inputs=2)


def create_pipeline(num_samples, batch_size, num_gpus, input_dropout,
                    data_prefix, is_training):
    logging.info(f"Loading {data_prefix} data...")
    shuffle = args.shuffle_data if is_training else False

    data_layer = MultiWOZDataLayer(
        abs_data_dir,
        data_desc.domains,
        all_domains=data_desc.all_domains,
        vocab=data_desc.vocab,
        slots=data_desc.slots,
Example #5
hidden_size = pretrained_bert_model.hidden_size

data_desc = JointIntentSlotDataDesc(data_dir=args.data_dir,
                                    none_slot_label=args.none_slot_label,
                                    pad_label=args.pad_label)

# Create sentence classification loss on top
classifier = JointIntentSlotClassifier(hidden_size=hidden_size,
                                       num_intents=data_desc.num_intents,
                                       num_slots=data_desc.num_slots,
                                       dropout=args.fc_dropout)

if args.class_balancing == 'weighted_loss':
    # To tackle imbalanced classes, you may use weighted loss
    intent_loss_fn = CrossEntropyLossNM(logits_ndim=2,
                                        weight=data_desc.intent_weights)
    slot_loss_fn = CrossEntropyLossNM(logits_ndim=3,
                                      weight=data_desc.slot_weights)
else:
    intent_loss_fn = CrossEntropyLossNM(logits_ndim=2)
    slot_loss_fn = CrossEntropyLossNM(logits_ndim=3)

total_loss_fn = LossAggregatorNM(
    num_inputs=2,
    weights=[args.intent_loss_weight, 1.0 - args.intent_loss_weight])
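# Later in the DAG the two losses are typically aggregated with the module
# above; hypothetical tensor names, shown for illustration only:
# total_loss = total_loss_fn(loss_1=intent_loss, loss_2=slot_loss)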


def create_pipeline(num_samples=-1,
                    batch_size=32,
                    data_prefix='train',
                    is_training=True,
Example #6
        vocab_size=output_vocab_size,
        attn_score_dropout=args.decoder_attn_score_dropout,
        max_seq_length=args.max_seq_length,
        embedding_dropout=args.decoder_embedding_dropout,
        hidden_act=args.decoder_hidden_act,
        use_full_attention=args.use_full_attention,
    )

    logits = nemo_nlp.nm.trainables.TokenClassifier(
        hidden_size,
        num_classes=output_vocab_size,
        num_layers=1,
        log_softmax=False,
        dropout=0.1)

    loss_fn = CrossEntropyLossNM(logits_ndim=3)
    loss_eval_metric = CrossEntropyLossNM(logits_ndim=3, reduction='none')

    if args.command == "infer":
        beam_search = nemo_nlp.nm.trainables.BeamSearchTranslatorNM(
            decoder=decoder,
            log_softmax=logits,
            max_seq_length=args.max_seq_length,
            beam_size=args.beam_size,
            length_penalty=args.length_penalty,
            bos_token=tokenizer.bos_id,
            pad_token=tokenizer.pad_id,
            eos_token=tokenizer.eos_id,
        )

    # tie all embeddings weights
Example #7
def test_simple_vc_trainer():
    # Train a sample model with test data

    # Create neural factory
    model_dir = os.path.join(get_data_folder(), ".test_model")
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU,
        checkpoint_dir=model_dir)

    # Generate dataset
    bam = os.path.join(get_data_folder(), "small_bam.bam")
    labels = os.path.join(get_data_folder(), "candidates.vcf.gz")
    vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False)

    # Neural Network
    alexnet = AlexNet(num_input_channels=1, num_output_logits=3)

    # Create train DAG
    dataset_train = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN,
                                         [vcf_loader],
                                         batch_size=32,
                                         shuffle=True)
    vz_ce_loss = CrossEntropyLossNM(logits_ndim=2)
    vz_labels, encoding = dataset_train()
    vz = alexnet(encoding=encoding)
    vz_loss = vz_ce_loss(logits=vz, labels=vz_labels)

    # Create evaluation DAG using same dataset as training
    dataset_eval = ReadPileupDataLoader(ReadPileupDataLoader.Type.EVAL,
                                        [vcf_loader],
                                        batch_size=32,
                                        shuffle=False)
    vz_ce_loss_eval = CrossEntropyLossNM(logits_ndim=2)
    vz_labels_eval, encoding_eval = dataset_eval()
    vz_eval = alexnet(encoding=encoding_eval)
    vz_loss_eval = vz_ce_loss_eval(logits=vz_eval, labels=vz_labels_eval)

    # Logger callback
    logger_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[vz_loss, vz, vz_labels],
        step_freq=1,
    )

    evaluator_callback = nemo.core.EvaluatorCallback(
        eval_tensors=[vz_loss_eval, vz_eval, vz_labels_eval],
        user_iter_callback=eval_iter_callback,
        user_epochs_done_callback=eval_epochs_done_callback,
        eval_step=1,
    )

    # Checkpointing models through NeMo callback
    checkpoint_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir,
        load_from_folder=None,
        # Checkpointing frequency in steps
        step_freq=-1,
        # Checkpointing frequency in epochs
        epoch_freq=1,
        # Number of checkpoints to keep
        checkpoints_to_keep=1,
        # If True, CheckpointCallback will raise an Error if restoring fails
        force_load=False)

    # Invoke the "train" action.
    nf.train(
        [vz_loss],
        callbacks=[logger_callback, checkpoint_callback, evaluator_callback],
        optimization_params={
            "num_epochs": 1,
            "lr": 0.001
        },
        optimizer="adam")

    assert (os.path.exists(os.path.join(model_dir, "AlexNet-EPOCH-1.pt")))
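    # To resume from the saved checkpoint in a later run, the callback above
    # could point load_from_folder at the same directory (hypothetical sketch,
    # not part of the original test):
    # nemo.core.CheckpointCallback(folder=model_dir,
    #                              load_from_folder=model_dir,
    #                              epoch_freq=1)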
Example #8
def train(args):
    """Train a sample model with test data."""
    # Create neural factory as per NeMo requirements.
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU)

    model = create_model()

    # Create train DAG
    train_dataset = HDFDataLoader(args.train_hdf,
                                  batch_size=32,
                                  shuffle=True,
                                  num_workers=args.threads,
                                  tensor_keys=["encodings", "labels"],
                                  tensor_dims=[('B', 'C', 'H', 'W'),
                                               tuple('B')],
                                  tensor_neural_types=[
                                      ReadPileupNeuralType(),
                                      VariantZygosityNeuralType()
                                  ])
    vz_ce_loss = CrossEntropyLossNM(logits_ndim=2)
    encoding, vz_labels = train_dataset()
    vz = model(encoding=encoding)
    vz_loss = vz_ce_loss(logits=vz, labels=vz_labels)

    callbacks = []

    # Logger callback
    loggercallback = nemo.core.SimpleLossLoggerCallback(
        tensors=[vz_loss],
        step_freq=5,
        print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'),
    )
    callbacks.append(loggercallback)

    # Checkpointing models through NeMo callback
    checkpointcallback = nemo.core.CheckpointCallback(
        folder=args.model_dir,
        load_from_folder=None,
        # Checkpointing frequency in steps
        step_freq=-1,
        # Checkpointing frequency in epochs
        epoch_freq=1,
        # Number of checkpoints to keep
        checkpoints_to_keep=1,
        # If True, CheckpointCallback will raise an Error if restoring fails
        force_load=False)
    callbacks.append(checkpointcallback)

    # Create eval DAG if eval files are available
    if args.eval_hdf:
        eval_dataset = HDFDataLoader(args.eval_hdf,
                                     batch_size=32,
                                     shuffle=False,
                                     num_workers=args.threads,
                                     tensor_keys=["encodings", "labels"],
                                     tensor_dims=[('B', 'C', 'H', 'W'),
                                                  tuple('B')],
                                     tensor_neural_types=[
                                         ReadPileupNeuralType(),
                                         VariantZygosityNeuralType()
                                     ])
        eval_vz_ce_loss = CrossEntropyLossNM(logits_ndim=2)
        eval_encoding, eval_vz_labels = eval_dataset()
        eval_vz = model(encoding=eval_encoding)
        eval_vz_loss = eval_vz_ce_loss(logits=eval_vz, labels=eval_vz_labels)

        # Add evaluation callback
        evaluator_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[eval_vz_loss, eval_vz, eval_vz_labels],
            user_iter_callback=eval_iter_callback,
            user_epochs_done_callback=eval_epochs_done_callback,
            eval_step=100,
            eval_epoch=1,
            eval_at_start=False,
        )
        callbacks.append(evaluator_callback)

    # Invoke the "train" action.
    nf.train([vz_loss],
             callbacks=callbacks,
             optimization_params={
                 "num_epochs": args.epochs,
                 "lr": 0.001
             },
             optimizer="adam")