Example #1
import torch

from collections import OrderedDict
from functools import partial
from torch.autograd import Variable
from torch.nn import CrossEntropyLoss, Module
from torch.optim import SGD

from .utils import add_metrics_to_log, get_loader, log_to_message, ProgressBar

DEFAULT_LOSS = CrossEntropyLoss()
DEFAULT_OPTIMIZER = partial(SGD, lr=0.001, momentum=0.9)


class FitModule(Module):
    def fit(self,
            X,
            y,
            batch_size=32,
            epochs=1,
            verbose=1,
            validation_split=0.,
            validation_data=None,
            shuffle=True,
            initial_epoch=0,
            seed=None,
            loss=DEFAULT_LOSS,
            optimizer=DEFAULT_OPTIMIZER,
            metrics=None):
        """Trains the model similar to Keras' .fit(...) method
Example #2
    def forward(
        self,
        input_ids=None,
        past=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_tuple=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for language modeling.
            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``
        """
        return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

        transformer_outputs = self.transformer(
            input_ids,
            past=past,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_tuple=return_tuple,
        )

        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if return_tuple:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=lm_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
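The docstring above notes that labels are shifted inside the model, so passing labels=input_ids yields an ordinary next-token loss. A standalone sketch of that shift-and-flatten computation on toy tensors (all sizes are made up):

import torch
from torch.nn import CrossEntropyLoss

batch, seq_len, vocab = 2, 5, 11                      # toy sizes
lm_logits = torch.randn(batch, seq_len, vocab)
labels = torch.randint(0, vocab, (batch, seq_len))    # e.g. labels = input_ids

shift_logits = lm_logits[..., :-1, :].contiguous()    # predictions at positions 0..n-2
shift_labels = labels[..., 1:].contiguous()           # targets are the next tokens 1..n-1
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab), shift_labels.view(-1))
print(loss.item())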
Example #3
        self.linear_layers = Sequential(Linear(196, 10))

    # Defining the forward pass
    def forward(self, x):
        x = self.cnn_layers(x.float()).float()
        x = x.view(x.size(0), -1)
        x = self.linear_layers(x)
        return x


model = Net()
# defining the optimizer
optimizer = Adam(model.parameters(), lr=0.07)
# defining the loss function
criterion = CrossEntropyLoss()
# checking if GPU is available
if False:  #torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()

print(model)


def train(epoch):
    model.train()
    tr_loss = 0
    # getting the training set
    x_train, y_train = Variable(train_x), Variable(train_y)
    # getting the validation set
    x_val, y_val = Variable(val_x), Variable(val_y)
Example #4
    def forward(
        self,
        input_ids=None,
        token_type_ids=None,
        attention_mask=None,
        labels=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        flat_input_ids = input_ids.view(
            -1, input_ids.size(-1)) if input_ids is not None else None
        flat_position_ids = position_ids.view(
            -1, position_ids.size(-1)) if position_ids is not None else None
        flat_token_type_ids = token_type_ids.view(
            -1,
            token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(
            -1,
            attention_mask.size(-1)) if attention_mask is not None else None
        flat_inputs_embeds = (inputs_embeds.view(-1, inputs_embeds.size(-2),
                                                 inputs_embeds.size(-1))
                              if inputs_embeds is not None else None)

        outputs = self.roberta(
            flat_input_ids,
            position_ids=flat_position_ids,
            token_type_ids=flat_token_type_ids,
            attention_mask=flat_attention_mask,
            head_mask=head_mask,
            inputs_embeds=flat_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits, ) + outputs[2:]
            return ((loss, ) + output) if loss is not None else output

        return MultipleChoiceModelOutput(
            loss=loss,
            logits=reshaped_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
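The multiple-choice head above folds the choice dimension into the batch, scores each (context, choice) pair, and reshapes back to (batch, num_choices) before the cross-entropy. A toy sketch of just that reshaping, with made-up sizes and a plain Linear layer standing in for the RoBERTa encoder:

import torch
from torch.nn import CrossEntropyLoss, Linear

batch, num_choices, seq_len, hidden = 3, 4, 7, 16
input_ids = torch.randint(0, 100, (batch, num_choices, seq_len))
flat_input_ids = input_ids.view(-1, input_ids.size(-1))    # (batch*num_choices, seq_len)

pooled_output = torch.randn(batch * num_choices, hidden)   # stand-in for the pooled encoder output
classifier = Linear(hidden, 1)
logits = classifier(pooled_output)                          # (batch*num_choices, 1)
reshaped_logits = logits.view(-1, num_choices)              # (batch, num_choices)

labels = torch.randint(0, num_choices, (batch,))
loss = CrossEntropyLoss()(reshaped_logits, labels)
print(reshaped_logits.shape, loss.item())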
Example #5
    # Model
    logging.info('==> Building model..')
    # fold is currently supported only for resnet18 in the HW implementation
    if args.fold:
        assert (args.model == 'resnet18')
        args.model += 'a'

    modelClass = Models.__dict__[args.model]
    model = modelClass(args)

    # Load preTrained weights.
    logging.info('==> Resuming from checkpoint..')
    model.loadPreTrained()
    model = model.cuda()

    criterion = CrossEntropyLoss().cuda()

    run = Run(model, logging, criterion)

    # log command line
    logging.info('CommandLine: {} PID: {} '
                 'Hostname: {} CUDA_VISIBLE_DEVICES {}'.format(
                     argv, getpid(), gethostname(),
                     environ.get('CUDA_VISIBLE_DEVICES')))

    # Weights quantization
    if args.weightBitwidth < 32 and not args.fold:
        model_path = './qmodels'
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        model_path = os.path.join(
Example #6
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`).
            Positions outside of the sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss, ) +
                    output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
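The clamping above maps any start/end position that falls outside the sequence onto ignored_index, and CrossEntropyLoss(ignore_index=...) then skips those terms. A small standalone sketch with toy sizes:

import torch
from torch.nn import CrossEntropyLoss

batch, seq_len = 4, 10
start_logits = torch.randn(batch, seq_len)
start_positions = torch.tensor([3, 25, 0, 9])               # 25 lies outside the sequence

ignored_index = start_logits.size(1)                        # == seq_len
start_positions = start_positions.clamp(0, ignored_index)   # 25 -> 10, a class index that is never predicted

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)        # the out-of-range example contributes nothing
print(start_loss.item())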
Example #7
    def forward(self,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                labels=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None,
                **kwargs):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            # note: this example ignores label index -1, rather than the -100
            # convention described in the docstring above
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(
                prediction_scores.view(-1, self.config.vocab_size),
                labels.view(-1))

        if not return_dict:
            output = (prediction_scores, ) + outputs[2:]
            return ((masked_lm_loss, ) +
                    output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
Example #8
    def forward(
        self,
        input_values,
        attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        labels=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states

        outputs = self.sew(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states *
                             norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        hidden_states = self.projector(hidden_states)
        if attention_mask is None:
            pooled_output = hidden_states.mean(dim=1)
        else:
            padding_mask = self._get_feature_vector_attention_mask(
                hidden_states.shape[1], attention_mask)
            hidden_states[~padding_mask] = 0.0
            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(
                dim=1).view(-1, 1)

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.num_labels),
                            labels.view(-1))

        if not return_dict:
            output = (logits, ) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss, ) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
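When use_weighted_layer_sum is set, the example learns one scalar per transformer layer, softmax-normalizes them, and blends the per-layer hidden states; pooling then averages only over non-padded frames. A self-contained sketch with invented shapes and a hand-built padding mask:

import torch
import torch.nn as nn

num_layers, batch, frames, hidden = 5, 2, 8, 16
layer_outputs = [torch.randn(batch, frames, hidden) for _ in range(num_layers)]
layer_weights = nn.Parameter(torch.ones(num_layers))        # learned in the real model

hidden_states = torch.stack(layer_outputs, dim=1)           # (batch, num_layers, frames, hidden)
norm_weights = nn.functional.softmax(layer_weights, dim=-1)
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)

padding_mask = torch.tensor([[True] * 8, [True] * 5 + [False] * 3])  # second clip is shorter
hidden_states[~padding_mask] = 0.0
pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
print(pooled_output.shape)                                   # (batch, hidden)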
Example #9
    def loss(self):
        """Return the cross-entropy loss criterion."""
        return CrossEntropyLoss()
Example #10
    def forward(self,
                beam_size=1,
                cls_ids=None,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                input_mask=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                start_positions=None,
                end_positions=None,
                p_mask=None,
                global_attention_mask=None):
        # set global attention on question tokens
        if global_attention_mask is None:
            # logger.info("Initializing global attention on question tokens...")
            # put global attention on all tokens until `config.sep_token_id` is reached
            # global_attention_mask = _compute_global_attention_mask(input_ids, self.config.sep_token_id)
            global_attention_mask = p_mask

        outputs = self.longformer(
            input_ids,
            global_attention_mask=global_attention_mask,
            attention_mask=attention_mask,
            # token_type_ids=None,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
        )

        hidden_states = outputs[0]
        start_logits = self.start_logits(hidden_states, p_mask=p_mask)

        outputs = outputs[1:]  # keep mems, hidden states, attentions if they are present

        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, let's remove the dimension added by batch splitting
            for x in (start_positions, end_positions):
                if x is not None and x.dim() > 1:
                    x.squeeze_(-1)

            # during training, compute the end logits based on the ground truth of the start position
            end_logits = self.end_logits(hidden_states,
                                         start_positions=start_positions,
                                         p_mask=p_mask)

            loss_fct = CrossEntropyLoss()

            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = total_loss

        else:
            # during inference, compute the end logits based on beam search
            bsz, slen, hsz = hidden_states.size()
            start_log_probs = F.softmax(start_logits,
                                        dim=-1)  # shape (bsz, slen)
            #            start_log_probs = F.sigmoid(start_logits)

            start_top_log_probs, start_top_index = torch.topk(
                start_log_probs, beam_size, dim=-1)  # shape (bsz, start_n_top)
            start_top_index_exp = start_top_index.unsqueeze(-1).expand(
                -1, -1, hsz)  # shape (bsz, start_n_top, hsz)
            start_states = torch.gather(
                hidden_states, -2,
                start_top_index_exp)  # shape (bsz, start_n_top, hsz)
            start_states = start_states.unsqueeze(1).expand(
                -1, slen, -1, -1)  # shape (bsz, slen, start_n_top, hsz)

            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
                start_states)  # shape (bsz, slen, start_n_top, hsz)
            p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
            end_logits = self.end_logits(hidden_states_expanded,
                                         start_states=start_states,
                                         p_mask=p_mask)
            end_log_probs = F.softmax(end_logits,
                                      dim=1)  # shape (bsz, slen, start_n_top)
            #            end_log_probs = F.sigmoid(end_logits)

            end_top_log_probs, end_top_index = torch.topk(
                end_log_probs, beam_size,
                dim=1)  # shape (bsz, end_n_top, start_n_top)
            end_top_log_probs = end_top_log_probs.view(-1,
                                                       beam_size * beam_size)
            end_top_index = end_top_index.view(-1, beam_size * beam_size)

            outputs = start_top_log_probs, start_top_index, end_top_log_probs, end_top_index

        return outputs
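At inference time the example keeps the beam_size most likely start positions (note it applies a plain softmax even though the variable is named start_log_probs), gathers the hidden states at those positions, and expands them so an end score can be computed for every (position, candidate start) pair. A toy sketch of just the topk/gather/expand shape handling, with made-up sizes:

import torch

bsz, slen, hsz, beam_size = 2, 6, 4, 3
hidden_states = torch.randn(bsz, slen, hsz)
start_logits = torch.randn(bsz, slen)

start_probs = torch.softmax(start_logits, dim=-1)                             # (bsz, slen)
start_top_probs, start_top_index = torch.topk(start_probs, beam_size, dim=-1) # (bsz, beam)
start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz)       # (bsz, beam, hsz)
start_states = torch.gather(hidden_states, -2, start_top_index_exp)           # (bsz, beam, hsz)
start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1)             # (bsz, slen, beam, hsz)
hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states)
print(start_states.shape, hidden_states_expanded.shape)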
Example #11
    def forward(self,
                beam_size=1,
                cls_ids=None,
                input_ids=None,
                attention_mask=None,
                token_type_ids=None,
                input_mask=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                start_positions=None,
                end_positions=None,
                p_mask=None):
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
        )

        hidden_states = outputs[1][-3]
        #        hidden_states = outputs[0]
        #        print(hidden_states.shape)
        #        hidden_states = torch.cat((outputs[2][-1],outputs[2][-2], outputs[2][-3], outputs[2][-4]),-1)
        start_logits = self.start_logits(hidden_states, p_mask=p_mask)

        outputs = outputs[1:]  # keep mems, hidden states, attentions if they are present

        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, let's remove the dimension added by batch splitting
            for x in (start_positions, end_positions):
                if x is not None and x.dim() > 1:
                    x.squeeze_(-1)

            # during training, compute the end logits based on the ground truth of the start position
            end_logits = self.end_logits(hidden_states,
                                         start_positions=start_positions,
                                         p_mask=p_mask)

            loss_fct = CrossEntropyLoss()

            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2
            outputs = total_loss

        else:
            # during inference, compute the end logits based on beam search
            bsz, slen, hsz = hidden_states.size()
            start_log_probs = F.softmax(start_logits,
                                        dim=-1)  # shape (bsz, slen)

            start_top_log_probs, start_top_index = torch.topk(
                start_log_probs, beam_size, dim=-1)  # shape (bsz, start_n_top)
            start_top_index_exp = start_top_index.unsqueeze(-1).expand(
                -1, -1, hsz)  # shape (bsz, start_n_top, hsz)
            start_states = torch.gather(
                hidden_states, -2,
                start_top_index_exp)  # shape (bsz, start_n_top, hsz)
            start_states = start_states.unsqueeze(1).expand(
                -1, slen, -1, -1)  # shape (bsz, slen, start_n_top, hsz)

            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
                start_states)  # shape (bsz, slen, start_n_top, hsz)
            p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
            end_logits = self.end_logits(hidden_states_expanded,
                                         start_states=start_states,
                                         p_mask=p_mask)
            end_log_probs = F.softmax(end_logits,
                                      dim=1)  # shape (bsz, slen, start_n_top)

            end_top_log_probs, end_top_index = torch.topk(
                end_log_probs, beam_size,
                dim=1)  # shape (bsz, end_n_top, start_n_top)
            end_top_log_probs = end_top_log_probs.view(-1,
                                                       beam_size * beam_size)
            end_top_index = end_top_index.view(-1, beam_size * beam_size)

            outputs = start_top_log_probs, start_top_index, end_top_log_probs, end_top_index

        return outputs
Example #12
def main_worker(index, opt):
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)  # per-process batch_size is the total batch_size divided by the number of GPUs per node
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)  # load the model; change which model is loaded in model.py
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)  # if pretrained weights are given, load them; fine-tuning uses pretrain_path
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)  # if resuming from an interrupted run, load that checkpoint; testing uses resume_path
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)  # cross-entropy loss function

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):  # start training
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
Example #13
    def forward(self,
                input_ids,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                valid_mask=None,
                start_positions=None,
                end_positions=None):
        outputs = self.distilbert(input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  token_type_ids=token_type_ids,
                                  position_ids=position_ids,
                                  head_mask=head_mask,
                                  inputs_embeds=inputs_embeds)

        sequence_output = outputs[0]
        sequence_output, attention_mask = valid_sequence_output(
            sequence_output, valid_mask, attention_mask)
        sequence_output = self.dropout(sequence_output)
        start_logits = self.start_fc(sequence_output)

        if start_positions is not None and self.training:
            if self.soft_label:
                batch_size = input_ids.size(0)
                seq_len = input_ids.size(1)
                label_logits = torch.FloatTensor(batch_size, seq_len,
                                                 self.num_labels)
                label_logits.zero_()
                label_logits = label_logits.to(input_ids.device)
                label_logits.scatter_(2, start_positions.unsqueeze(2), 1)
            else:
                label_logits = start_positions.unsqueeze(2).float()
        else:
            label_logits = F.softmax(start_logits, -1)
            if not self.soft_label:
                label_logits = torch.argmax(label_logits,
                                            -1).unsqueeze(2).float()
        end_logits = self.end_fc(sequence_output, label_logits)
        outputs = (
            start_logits,
            end_logits,
        ) + outputs[2:]

        if start_positions is not None and end_positions is not None:
            assert self.loss_type in ['lsr', 'focal', 'ce']
            if self.loss_type == 'lsr':
                loss_fct = LabelSmoothingCrossEntropy()
            elif self.loss_type == 'focal':
                loss_fct = FocalLoss()
            else:
                loss_fct = CrossEntropyLoss()
            start_logits = start_logits.view(-1, self.num_labels)
            end_logits = end_logits.view(-1, self.num_labels)
            active_loss = attention_mask.view(-1) == 1
            active_start_logits = start_logits[active_loss]
            active_end_logits = end_logits[active_loss]

            active_start_labels = start_positions.view(-1)[active_loss]
            active_end_labels = end_positions.view(-1)[active_loss]

            start_loss = loss_fct(active_start_logits, active_start_labels)
            end_loss = loss_fct(active_end_logits, active_end_labels)
            total_loss = (start_loss + end_loss) / 2
            outputs = (total_loss, ) + outputs
        return outputs
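With soft_label enabled, the example turns the gold start labels into one-hot "label logits" via scatter_ and feeds them to the end classifier. A tiny sketch of just that step (sizes invented):

import torch

batch_size, seq_len, num_labels = 2, 5, 4
start_positions = torch.tensor([[0, 2, 1, 0, 3],
                                [1, 1, 0, 2, 0]])            # per-token gold label ids

label_logits = torch.zeros(batch_size, seq_len, num_labels)
label_logits.scatter_(2, start_positions.unsqueeze(2), 1)    # one-hot along the label dimension
print(label_logits[0])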
Example #14
def main_worker(index, opt):
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    if opt.inference:
        model = generate_model(opt)
    else:
        model = generate_model(opt, use_features=True)

    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path,
                                      opt.n_finetune_classes)
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    #####################################################################################
    ### here add a classifier to predict videos and audios
    if opt.inference is False:
        ### define loss
        criterion = CrossEntropyLoss().to(opt.device)

        if opt.use_audio or opt.use_image:
            criterion_jsd = JSDLoss(weight=0.5)

        #################################################################################
        if opt.use_audio:
            ### define loss
            criterion_ct_av = NCELoss(temperature=0.5)
            ### audio teacher model
            feature_dim = 512 * 2
            if opt.pretrain_path is not None:
                joint_prediction_aud = generate_prediction(
                    feature_dim, opt.n_finetune_classes, normalization=True)
            else:
                joint_prediction_aud = generate_prediction(feature_dim,
                                                           opt.n_classes,
                                                           normalization=True)
            if opt.resume_path is not None:
                aux_checkpoint = Path(
                    os.path.join(str(opt.resume_path.parent),
                                 str(opt.resume_path.name[:-4] +
                                     '_audio.pth')))
                joint_prediction_aud = resume_model(aux_checkpoint, opt.arch,
                                                    joint_prediction_aud)

            joint_prediction_aud = make_data_parallel(joint_prediction_aud,
                                                      opt.distributed,
                                                      opt.device)
            aud_para = joint_prediction_aud.parameters()
            joint_prediction_aud.cuda()
        else:
            aud_para = None

        #################################################################################
        if opt.use_image:
            ### define loss
            criterion_ct_iv = NCELoss(temperature=0.1)
            ### image teacher model
            image_model = torchvision.models.resnet34(pretrained=True)
            # remove the fc layers (only use the image features)
            image_model = torch.nn.Sequential(
                *list(image_model.children())[:-1])
            image_model = make_data_parallel(image_model, opt.distributed,
                                             opt.device)
            feature_dim = 512 * 2
            if opt.pretrain_path is not None:
                joint_prediction_img = generate_prediction(
                    feature_dim, opt.n_finetune_classes, normalization=True)
            else:
                joint_prediction_img = generate_prediction(feature_dim,
                                                           opt.n_classes,
                                                           normalization=True)
            if opt.resume_path is not None:
                aux_checkpoint = Path(
                    os.path.join(str(opt.resume_path.parent),
                                 str(opt.resume_path.name[:-4] +
                                     '_image.pth')))
                joint_prediction_img = resume_model(aux_checkpoint, opt.arch,
                                                    joint_prediction_img)

            joint_prediction_img = make_data_parallel(joint_prediction_img,
                                                      opt.distributed,
                                                      opt.device)
            img_para = joint_prediction_img.parameters()
            joint_prediction_img.cuda()
        else:
            img_para = None

        #################################################################################
        (train_loader, train_sampler, train_logger, train_batch_logger, optimizer, optimizer_av, optimizer_iv, scheduler) = \
            get_train_utils(opt, model_parameters=parameters, av_parameters=aud_para, iv_parameters=img_para)

        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones

    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    pre_val_acc = 0.0
    if opt.image_size > opt.sample_size:
        image_size = opt.image_size
    else:
        image_size = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            if optimizer_av is None and optimizer_iv is None:
                train_epoch(epoch=i,
                            data_loader=train_loader,
                            model=model,
                            criterion=criterion,
                            optimizer=optimizer,
                            device=opt.device,
                            current_lr=current_lr,
                            epoch_logger=train_logger,
                            batch_logger=train_batch_logger,
                            tb_writer=tb_writer,
                            distributed=opt.distributed)
            elif optimizer_av is not None and optimizer_iv is None:
                train_a_epoch(epoch=i,
                              data_loader=train_loader,
                              model=model,
                              joint_prediction_aud=joint_prediction_aud,
                              criterion=criterion,
                              criterion_jsd=criterion_jsd,
                              criterion_ct_av=criterion_ct_av,
                              optimizer=optimizer,
                              optimizer_av=optimizer_av,
                              device=opt.device,
                              current_lr=current_lr,
                              epoch_logger=train_logger,
                              batch_logger=train_batch_logger,
                              tb_writer=tb_writer,
                              distributed=opt.distributed)
            elif optimizer_av is None and optimizer_iv is not None:
                train_i_epoch(epoch=i,
                              data_loader=train_loader,
                              model=model,
                              image_model=image_model,
                              joint_prediction_img=joint_prediction_img,
                              criterion=criterion,
                              criterion_jsd=criterion_jsd,
                              criterion_ct_iv=criterion_ct_iv,
                              optimizer=optimizer,
                              optimizer_iv=optimizer_iv,
                              device=opt.device,
                              current_lr=current_lr,
                              epoch_logger=train_logger,
                              batch_logger=train_batch_logger,
                              tb_writer=tb_writer,
                              distributed=opt.distributed,
                              image_size=image_size)
            else:
                train_ai_epoch(epoch=i,
                               data_loader=train_loader,
                               model=model,
                               image_model=image_model,
                               joint_prediction_aud=joint_prediction_aud,
                               joint_prediction_img=joint_prediction_img,
                               criterion=criterion,
                               criterion_jsd=criterion_jsd,
                               criterion_ct_av=criterion_ct_av,
                               criterion_ct_iv=criterion_ct_iv,
                               optimizer=optimizer,
                               optimizer_av=optimizer_av,
                               optimizer_iv=optimizer_iv,
                               device=opt.device,
                               current_lr=current_lr,
                               epoch_logger=train_logger,
                               batch_logger=train_batch_logger,
                               tb_writer=tb_writer,
                               distributed=opt.distributed,
                               image_size=image_size,
                               loss_weight=opt.loss_weight)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)
                if opt.use_audio:
                    save_file_path = opt.result_path / 'save_{}_audio.pth'.format(
                        i)
                    save_checkpoint(save_file_path, i, opt.arch,
                                    joint_prediction_aud, optimizer, scheduler)
                if opt.use_image:
                    save_file_path = opt.result_path / 'save_{}_image.pth'.format(
                        i)
                    save_checkpoint(save_file_path, i, opt.arch,
                                    joint_prediction_img, optimizer, scheduler)
            if not opt.no_val and i % opt.val_freq == 0:
                prev_val_loss, val_acc = val_epoch(i, val_loader, model,
                                                   criterion, opt.device,
                                                   val_logger, tb_writer,
                                                   opt.distributed)
                if pre_val_acc < val_acc:
                    pre_val_acc = val_acc
                    save_file_path = opt.result_path / 'save_model.pth'
                    save_checkpoint(save_file_path, i, opt.arch, model,
                                    optimizer, scheduler)

            if not opt.no_train and opt.lr_scheduler == 'multistep':
                scheduler.step()
            elif not opt.no_train and opt.lr_scheduler == 'plateau':
                if prev_val_loss is not None:
                    scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)
        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
Example #15
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")

    parser.add_argument('--kshot',
                        type=int,
                        default=5,
                        help="number of labeled examples (k) to sample for few-shot training")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=1e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")

    args = parser.parse_args()

    processors = {"rte": RteProcessor}

    output_modes = {"rte": "classification"}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    train_examples = processor.get_GAP_coreference(
        'gap-development.tsv', args.kshot)  #train_pu_half_v1.txt
    dev_examples = processor.get_GAP_coreference('gap-validation.tsv', 0)
    test_examples = processor.get_GAP_coreference('gap-test.tsv', 0)
    label_list = ["entailment", "not_entailment"]
    entity_label_list = ["A-coref", "B-coref"]
    # train_examples = get_data_hulu_fewshot('train', 5)
    # train_examples, dev_examples, test_examples, label_list = load_CLINC150_with_specific_domain_sequence(args.DomainName, args.kshot, augment=False)
    num_labels = len(label_list)
    print('num_labels:', num_labels, 'training size:', len(train_examples),
          'dev size:', len(dev_examples), 'test size:', len(test_examples))

    num_train_optimization_steps = None
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    model = RobertaForSequenceClassification(num_labels)
    tokenizer = RobertaTokenizer.from_pretrained(
        pretrain_model_dir, do_lower_case=args.do_lower_case)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    max_test_acc = 0.0
    max_dev_acc = 0.0
    max_dev_threshold = 0.0
    if args.do_train:
        train_dataloader = examples_to_features(train_examples,
                                                label_list,
                                                entity_label_list,
                                                args,
                                                tokenizer,
                                                args.train_batch_size,
                                                "classification",
                                                dataloader_mode='random')
        dev_dataloader = examples_to_features(dev_examples,
                                              label_list,
                                              entity_label_list,
                                              args,
                                              tokenizer,
                                              args.eval_batch_size,
                                              "classification",
                                              dataloader_mode='sequential')
        test_dataloader = examples_to_features(test_examples,
                                               label_list,
                                               entity_label_list,
                                               args,
                                               tokenizer,
                                               args.eval_batch_size,
                                               "classification",
                                               dataloader_mode='sequential')

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)

        iter_co = 0
        final_test_performance = 0.0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_example_ids, input_ids, input_mask, segment_ids, label_ids, entity_label_ids = batch

                logits = model(input_ids, input_mask)
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, num_labels),
                                label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                iter_co += 1
                # if iter_co %100==0:
                #     print('iter_co:', iter_co, ' mean loss:', tr_loss/iter_co)
                if iter_co % len(train_dataloader) == 0:

                    model.eval()
                    # evaluate on the dev set after this epoch

                    logger.info("***** Running dev *****")
                    logger.info("  Num examples = %d", len(dev_examples))

                    eval_loss = 0
                    nb_eval_steps = 0
                    preds = []
                    gold_label_ids = []
                    example_id_list = []
                    for _, batch in enumerate(tqdm(dev_dataloader,
                                                   desc="dev")):
                        input_indices, input_ids, input_mask, segment_ids, _, label_ids = batch
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)
                        example_ids = list(input_indices.numpy())
                        example_id_list += example_ids
                        gold_label_ids += list(
                            label_ids.detach().cpu().numpy())

                        with torch.no_grad():
                            logits = model(input_ids, input_mask)
                        if len(preds) == 0:
                            preds.append(logits.detach().cpu().numpy())
                        else:
                            preds[0] = np.append(preds[0],
                                                 logits.detach().cpu().numpy(),
                                                 axis=0)

                    preds = preds[0]

                    pred_probs = softmax(preds, axis=1)
                    pred_label_ids_3way = list(np.argmax(pred_probs, axis=1))
                    pred_prob_entail = list(pred_probs[:, 0])

                    assert len(example_id_list) == len(pred_prob_entail)
                    assert len(example_id_list) == len(gold_label_ids)
                    assert len(example_id_list) == len(pred_label_ids_3way)

                    best_current_dev_acc = 0.0
                    best_current_threshold = -10.0
                    for threshold in np.arange(0.99, 0.0, -0.01):
                        eval_output_list = build_GAP_output_format(
                            example_id_list,
                            gold_label_ids,
                            pred_prob_entail,
                            pred_label_ids_3way,
                            threshold,
                            dev_or_test='validation')
                        dev_acc = run_scorer(
                            '/export/home/Dataset/gap_coreference/gap-validation.tsv',
                            eval_output_list)
                        if dev_acc > best_current_dev_acc:
                            best_current_dev_acc = dev_acc
                            best_current_threshold = threshold
                    print('best_current_dev_threshold:',
                          best_current_threshold, 'best_current_dev_acc:',
                          best_current_dev_acc)

                    if best_current_dev_acc > max_dev_acc:
                        max_dev_acc = best_current_dev_acc
                        max_dev_threshold = best_current_threshold
                        # evaluate on the test set
                        logger.info("***** Running test *****")
                        logger.info("  Num examples = %d", len(test_examples))

                        eval_loss = 0
                        nb_eval_steps = 0
                        preds = []
                        gold_label_ids = []
                        example_id_list = []
                        for _, batch in enumerate(
                                tqdm(test_dataloader, desc="test")):
                            input_indices, input_ids, input_mask, segment_ids, _, label_ids = batch
                            input_ids = input_ids.to(device)
                            input_mask = input_mask.to(device)
                            segment_ids = segment_ids.to(device)
                            label_ids = label_ids.to(device)
                            example_ids = list(input_indices.numpy())
                            example_id_list += example_ids
                            gold_label_ids += list(
                                label_ids.detach().cpu().numpy())

                            with torch.no_grad():
                                logits = model(input_ids, input_mask)
                            if len(preds) == 0:
                                preds.append(logits.detach().cpu().numpy())
                            else:
                                preds[0] = np.append(
                                    preds[0],
                                    logits.detach().cpu().numpy(),
                                    axis=0)

                        preds = preds[0]

                        pred_probs = softmax(preds, axis=1)
                        pred_label_ids_3way = list(
                            np.argmax(pred_probs, axis=1))
                        pred_prob_entail = list(pred_probs[:, 0])

                        assert len(example_id_list) == len(pred_prob_entail)
                        assert len(example_id_list) == len(gold_label_ids)
                        assert len(example_id_list) == len(pred_label_ids_3way)

                        threshold = max_dev_threshold
                        eval_output_list = build_GAP_output_format(
                            example_id_list,
                            gold_label_ids,
                            pred_prob_entail,
                            pred_label_ids_3way,
                            threshold,
                            dev_or_test='test')

                        test_acc = run_scorer(
                            '/export/home/Dataset/gap_coreference/gap-test.tsv',
                            eval_output_list)
                        if test_acc > max_test_acc:
                            max_test_acc = test_acc
                        print('current_test_acc:', test_acc, ' max_test_acc:',
                              max_test_acc)
                        final_test_performance = test_acc
        print('final_test_performance:', final_test_performance)
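The loop above tunes a probability threshold on the GAP validation set and only re-scores the test set when the dev accuracy improves. A minimal sketch of the threshold sweep, assuming a hypothetical score_fn(threshold) standing in for build_GAP_output_format plus run_scorer:

import numpy as np

def tune_threshold(score_fn):
    # score_fn(threshold) is assumed to build the GAP output for that threshold
    # and return dev accuracy (build_GAP_output_format + run_scorer above)
    best_acc, best_threshold = 0.0, -10.0
    for threshold in np.arange(0.99, 0.0, -0.01):
        acc = score_fn(threshold)
        if acc > best_acc:
            best_acc, best_threshold = acc, threshold
    return best_threshold, best_acc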
Example #16
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        enc_hiddens=None,
        encoder_attention_mask=None,
        caches=None,
        labels=None,
        y_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.rembert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            enc_hiddens=enc_hiddens,
            encoder_attention_mask=encoder_attention_mask,
            caches=caches,
            y_cache=y_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(
                shifted_prediction_scores.view(-1, self.config.s_vocab),
                labels.view(-1))

        if not return_dict:
            output = (prediction_scores, ) + outputs[2:]
            return ((lm_loss, ) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            caches=outputs.caches,
            hiddens=outputs.hiddens,
            attns=outputs.attns,
            crosses=outputs.crosses,
        )
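The causal-LM head above shifts the prediction scores and labels by one position so that the score at step t is trained to predict the token at step t+1. A self-contained sketch of that loss computation on a random (batch, seq_len, vocab_size) tensor:

import torch
from torch.nn import CrossEntropyLoss

batch, seq_len, vocab_size = 2, 8, 50
logits = torch.randn(batch, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (batch, seq_len))

# drop the last prediction and the first label so position t predicts token t+1
shift_logits = logits[:, :-1, :].contiguous()
shift_labels = labels[:, 1:].contiguous()
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(loss.item())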
Example #17
    def forward(self, q_vec, d_vec, sd_vec, labels=None):
        # embedding input vector
        q_emb = self.embedding(q_vec)
        d_emb = self.embedding(d_vec)
        sd_emb = self.embedding(sd_vec)

        q_transform = self.q_transformer(q_emb)
        d_transform = self.d_transformer(d_emb)
        sd_transform = self.sd_transformer(sd_emb)

        # Residual Net
        q_res = torch.cat((q_transform, q_emb), 2)
        d_res = torch.cat((d_transform, d_emb), 2)
        sd_res = torch.cat((sd_transform, sd_emb), 2)

        # alignment
        q_d_similarity = self.similarity(d_res, q_res)
        d2q = self.context_to_query(q_d_similarity, d_res)
        q2d = self.query_to_context(q_d_similarity, q_res)
        q_d_final = self.final_attention(d_res, d2q, q2d)

        q_sd_similarity = self.similarity(sd_res, q_res)
        sd2q = self.context_to_query(q_sd_similarity, sd_res)
        q2sd = self.query_to_context(q_sd_similarity, q_res)
        q_sd_final = self.final_attention(sd_res, sd2q, q2sd)

        q_d_concat = torch.cat((q_res, q_d_final, d_res),
                               2)  # [batch_size, 128, embedding_size*12]
        q_sd_concat = torch.cat((q_res, q_sd_final, sd_res),
                                2)  # [batch_size, 128, embedding_size*12]

        q_d_linear = self.q_d_Linear(
            q_d_concat)  # [batch_size, 128, embedding_size*12]
        q_sd_linear = self.q_sd_Linear(
            q_sd_concat)  # [batch_size, 128, embedding_size*12]

        all_concat_input = torch.cat((q_d_linear, q_sd_linear),
                                     2)  # [batch_size, 128, embedding_size*24]
        all_concat_input = all_concat_input.to(self.device)
        result_output = self.result_Linear(
            all_concat_input)  # [batch_size, 128, embedding_size*24]
        result_output = result_output.to(self.device)

        result_output = result_output.permute(0, 2, 1)
        avg_pool = F.adaptive_avg_pool1d(result_output, 1)
        max_pool = F.adaptive_max_pool1d(result_output, 1)

        avg_pool = avg_pool.view(q_vec.size(0), -1)
        max_pool = max_pool.view(q_vec.size(0), -1)

        result = torch.cat((avg_pool, max_pool),
                           1)  # [batch_size, embedding_size*48]

        logits = self.classifier(result)

        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits, labels)
            return loss, logits
        else:
            return logits
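The classifier above summarizes the sequence dimension with both adaptive average pooling and adaptive max pooling, then concatenates the two summaries before the final classifier. A small sketch of just that pooling step, assuming a (batch, seq_len, features) input:

import torch
import torch.nn.functional as F

batch, seq_len, features = 2, 128, 64
x = torch.randn(batch, seq_len, features)

x = x.permute(0, 2, 1)                                   # (batch, features, seq_len) for 1d pooling
avg_pool = F.adaptive_avg_pool1d(x, 1).view(batch, -1)   # (batch, features)
max_pool = F.adaptive_max_pool1d(x, 1).view(batch, -1)   # (batch, features)
summary = torch.cat((avg_pool, max_pool), dim=1)         # (batch, 2 * features)
print(summary.shape)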
Example #18
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        num_choices = input_ids.shape[
            1] if input_ids is not None else inputs_embeds.shape[1]

        input_ids = input_ids.view(
            -1, input_ids.size(-1)) if input_ids is not None else None
        attention_mask = (attention_mask.view(-1, attention_mask.size(-1))
                          if attention_mask is not None else None)
        token_type_ids = (token_type_ids.view(-1, token_type_ids.size(-1))
                          if token_type_ids is not None else None)
        position_ids = (position_ids.view(-1, position_ids.size(-1))
                        if position_ids is not None else None)
        inputs_embeds = (inputs_embeds.view(-1, inputs_embeds.size(-2),
                                            inputs_embeds.size(-1))
                         if inputs_embeds is not None else None)

        outputs = self.rembert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.drop(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(reshaped_logits, labels)

        if not return_dict:
            output = (reshaped_logits, ) + outputs[2:]
            return ((loss, ) + output) if loss is not None else output

        return qo.WithLoss(
            loss=loss,
            logits=reshaped_logits,
            hiddens=outputs.hiddens,
            attns=outputs.attns,
        )
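The multiple-choice head above flattens the (batch, num_choices, seq_len) inputs, scores each choice with a single logit, and then reshapes the logits back to (batch, num_choices) so CrossEntropyLoss can treat the choices as classes. A minimal sketch of that reshape:

import torch
from torch.nn import CrossEntropyLoss

batch, num_choices = 4, 3
# one logit per (example, choice) pair, produced by the classifier on the flattened batch
logits = torch.randn(batch * num_choices, 1)
labels = torch.randint(0, num_choices, (batch,))

reshaped_logits = logits.view(-1, num_choices)   # (batch, num_choices)
loss = CrossEntropyLoss()(reshaped_logits, labels)
print(loss.item())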
    def forward(self,
                input_ids=None,
                input_ids_org=None,
                attention_mask=None,
                token_type_ids=None,
                position_ids=None,
                head_mask=None,
                inputs_embeds=None,
                labels=None,
                output_attentions=None,
                output_hidden_states=None,
                return_dict=None,
                func=None,
                tail_idxs=None,
                in_domain_rep=None,
                out_domain_rep=None,
                sentence_label=None,
                lm_label=None,
                batch_size=None,
                all_in_task_rep_comb=None,
                all_sentence_binary_label=None,
                from_query=False,
                **kwargs):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the masked language modeling loss.
            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
            in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if func == "in_domain_task_rep":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            #x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
            #rep = outputs.last_hidden_state[:, 0, :]
            rep_head = outputs.last_hidden_state[:, 0, :]
            rep_tail = outputs.last_hidden_state[input_ids_org == 2]
            #detach
            #rep = rep.detach()
            '''
            in_domain_rep = self.domain_layer(rep)
            in_task_rep = self.task_layer(rep)
            return in_domain_rep, in_task_rep
            '''
            return rep_tail, rep_head

        elif func == "in_domain_task_rep_mean":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            #x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
            rep = outputs.last_hidden_state
            mask = rep != 0
            rep = (rep * mask).sum(dim=1) / mask.sum(dim=1)

            #detach
            #rep = rep.detach()
            '''
            in_domain_rep = self.domain_layer(rep)
            in_task_rep = self.task_layer(rep)
            return in_domain_rep, in_task_rep
            '''
            return rep, rep

        elif func == "return_task_binary_classifier":
            return self.task_binary_classifier.weight.data, self.task_binary_classifier.bias.data

        elif func == "return_domain_binary_classifier":
            return self.domain_binary_classifier.weight.data, self.domain_binary_classifier.bias.data

        #if func == "task_binary_classifier":

        elif func == "domain_binary_classifier":
            #in:1 , out:0
            #Need to fix
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # Query rep is not included here, so in_domain_rep needs to be added
            loss_fct = CrossEntropyLoss()
            out_domain_rep_head = outputs.last_hidden_state[:, 0, :]
            out_domain_rep_tail = outputs.last_hidden_state[input_ids_org == 2]
            #print("model_head",out_domain_rep_head.shape)
            #print("model_tail",out_domain_rep_tail.shape)
            domain_rep = torch.cat([in_domain_rep, out_domain_rep_tail], 0)
            #detach
            #domain_rep = domain_rep.detach()
            logit = self.domain_binary_classifier(domain_rep)
            logit = self.LeakyReLU(logit)
            pos_target = torch.tensor([1] * in_domain_rep.shape[0]).to("cuda")
            neg_target = torch.tensor([0] *
                                      out_domain_rep_tail.shape[0]).to("cuda")
            target = torch.cat([pos_target, neg_target], 0)
            domain_loss = loss_fct(logit, target)

            return domain_loss, logit, out_domain_rep_head, out_domain_rep_tail

        elif func == "domain_binary_classifier_mean":
            #in:1 , out:0
            #Need to fix
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            # Query rep is not included here, so in_domain_rep needs to be added
            loss_fct = CrossEntropyLoss()
            out_domain_rep = outputs.last_hidden_state
            ###
            mask = out_domain_rep != 0
            out_domain_rep = (out_domain_rep *
                              mask).sum(dim=1) / mask.sum(dim=1)
            ###
            domain_rep = torch.cat([in_domain_rep, out_domain_rep], 0)
            #detach
            #domain_rep = domain_rep.detach()
            logit = self.domain_binary_classifier(domain_rep)
            logit = self.LeakyReLU(logit)
            pos_target = torch.tensor([1] * in_domain_rep.shape[0]).to("cuda")
            neg_target = torch.tensor([0] * out_domain_rep.shape[0]).to("cuda")
            target = torch.cat([pos_target, neg_target], 0)
            domain_loss = loss_fct(logit, target)
            return domain_loss, logit

        elif func == "task_binary_classifier":
            # Query rep is not included here, so in_domain_rep needs to be added
            loss_fct = CrossEntropyLoss()
            #detach
            #all_in_task_rep_comb = all_in_task_rep_comb.detach()
            logit = self.task_binary_classifier(all_in_task_rep_comb)
            logit = self.LeakyReLU(logit)
            all_sentence_binary_label = all_sentence_binary_label.reshape(
                all_sentence_binary_label.shape[0] *
                all_sentence_binary_label.shape[1])
            logit = logit.reshape(logit.shape[0] * logit.shape[1],
                                  logit.shape[2])
            task_binary_loss = loss_fct(logit.view(-1, 2),
                                        all_sentence_binary_label.view(-1))
            return task_binary_loss, logit

        elif func == "task_binary_classifier_mean":
            # Query rep is not included here, so in_domain_rep needs to be added
            loss_fct = CrossEntropyLoss()
            #detach
            #all_in_task_rep_comb = all_in_task_rep_comb.detach()
            logit = self.task_binary_classifier(all_in_task_rep_comb)
            logit = self.LeakyReLU(logit)
            all_sentence_binary_label = all_sentence_binary_label.reshape(
                all_sentence_binary_label.shape[0] *
                all_sentence_binary_label.shape[1])
            logit = logit.reshape(logit.shape[0] * logit.shape[1],
                                  logit.shape[2])
            task_binary_loss = loss_fct(logit.view(-1, 2),
                                        all_sentence_binary_label.view(-1))
            return task_binary_loss, logit

        elif func == "task_class":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            #Already including query rep
            loss_fct = CrossEntropyLoss()
            ###
            #class_logit = self.classifier(outputs.last_hidden_state, input_ids_org)
            class_logit = self.classifier(outputs.last_hidden_state)
            task_loss = loss_fct(class_logit.view(-1, self.num_labels),
                                 sentence_label.view(-1))

            if from_query:
                query_rep_head = outputs.last_hidden_state[:, 0, :]
                query_rep_tail = outputs.last_hidden_state[input_ids_org == 2]
                return task_loss, class_logit, query_rep_head, query_rep_tail
            else:
                return task_loss, class_logit

        elif func == "mlm":
            outputs_mlm = self.roberta(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

            loss_fct = CrossEntropyLoss()
            #sequence_output = outputs_mlm.last_hidden_state
            sequence_output = outputs_mlm[0]
            prediction_scores = self.lm_head(sequence_output)
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(
                prediction_scores.view(-1, self.config.vocab_size),
                lm_label.view(-1))
            return masked_lm_loss

        elif func == "task_class and mlm":
            #######
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            #######
            outputs_mlm = self.roberta(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            #######
            #Already including query rep
            #task loss
            loss_fct = CrossEntropyLoss()
            ###
            '''
            #rep = outputs.last_hidden_state[input_ids==2]
            rep = outputs.last_hidden_state[:, 0, :]
            #rep = rep.detach()
            task_rep = self.task_layer(rep)
            class_logit = self.layer_out_taskClass((self.act(task_rep)))
            '''
            class_logit = self.classifier(outputs.last_hidden_state)
            ###
            task_loss = loss_fct(class_logit.view(-1, 8),
                                 sentence_label.view(-1))

            #mlm loss
            sequence_output = outputs_mlm.last_hidden_state
            prediction_scores = self.lm_head(sequence_output)
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            masked_lm_loss = loss_fct(
                prediction_scores.view(-1, self.config.vocab_size),
                lm_label.view(-1))
            return task_loss, masked_lm_loss

        elif func == "gen_rep":
            outputs = self.roberta(
                input_ids=input_ids_org,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            return outputs
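Both the docstring above and the `mlm` branch rely on CrossEntropyLoss skipping a sentinel label: positions whose label equals ignore_index contribute nothing to the loss, so only the masked-out tokens are scored. A minimal sketch using -100, PyTorch's default ignore index (the code above passes ignore_index=-1 explicitly):

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 10
logits = torch.randn(6, vocab_size)
# only positions 1 and 4 carry real labels; the rest are ignored
labels = torch.tensor([-100, 3, -100, -100, 7, -100])

loss = CrossEntropyLoss(ignore_index=-100)(logits, labels)
print(loss.item())  # averaged over the two non-ignored positions only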
Example #20
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        head_mask=None,
        cross_attn_head_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`Speech2Text2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                if the model is configured as a decoder.
            encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
                in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
                tensors are only required when the model is used as a decoder in a Sequence to Sequence model.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).

            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.

        Returns:

        Example:

        ```python
        >>> from transformers import (
        ...     SpeechEncoderDecoderModel,
        ...     Speech2Text2ForCausalLM,
        ...     Wav2Vec2Model,
        ...     Speech2Text2Config,
        ...     Wav2Vec2Config,
        ...     Wav2Vec2FeatureExtractor,
        ...     Speech2Text2Tokenizer,
        ... )
        >>> from datasets import load_dataset

        >>> feature_extractor = Wav2Vec2FeatureExtractor()
        >>> tokenizer = Speech2Text2Tokenizer.from_pretrained("facebook/s2t-wav2vec2-large-en-de")

        >>> encoder = Wav2Vec2Model(Wav2Vec2Config())
        >>> decoder = Speech2Text2ForCausalLM(Speech2Text2Config())
        >>> # init random speech2text model

        >>> model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder)
        >>> model.config.pad_token_id = tokenizer.pad_token_id
        >>> model.config.decoder_start_token_id = tokenizer.bos_token_id
        >>> # pre-process inputs and labels

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(
        ...     ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
        ... )
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = tokenizer(ds[0]["text"], return_tensors="pt").input_ids
        >>> # compute loss

        >>> loss = model(inputs=input_values, labels=decoder_input_ids).loss
        >>> # backprop loss

        >>> loss.backward()  # doctest: +IGNORE_RESULT
        ```"""

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (output_hidden_states
                                if output_hidden_states is not None else
                                self.config.output_hidden_states)
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        logits = self.lm_head(outputs[0])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.config.vocab_size),
                            labels.view(-1))

        if not return_dict:
            output = (logits, ) + outputs[1:]
            return (loss, ) + output if loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
            Labels for computing the sequence classification/regression loss.
            Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
            If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        #####
        #return outputs
        #####

        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                #  We are doing regression
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels),
                                labels.view(-1))

        if not return_dict:
            output = (logits, ) + outputs[2:]
            return ((loss, ) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
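As the docstring above states, the same head covers regression and classification: with num_labels == 1 an MSE loss is applied to the squeezed logits, otherwise a cross-entropy loss over the label classes. A compact sketch of that branch, assuming float targets in the regression case:

import torch
from torch.nn import CrossEntropyLoss, MSELoss

def classification_or_regression_loss(logits, labels, num_labels):
    # logits: (batch, num_labels); labels: class ids, or float targets when num_labels == 1
    if num_labels == 1:
        return MSELoss()(logits.view(-1), labels.view(-1))
    return CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))

logits = torch.randn(4, 3)
labels = torch.tensor([0, 2, 1, 2])
print(classification_or_regression_loss(logits, labels, num_labels=3).item())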
Example #22
def Train(model, t, loader, eps_scheduler, max_eps, norm, logger, verbose,
          train, opt, method, **kwargs):
    # if train=True, use training mode
    # if train=False, use test mode, no back prop

    num_class = 10
    losses = AverageMeter()
    l1_losses = AverageMeter()
    errors = AverageMeter()
    robust_errors = AverageMeter()
    regular_ce_losses = AverageMeter()
    robust_ce_losses = AverageMeter()
    relu_activities = AverageMeter()
    bound_bias = AverageMeter()
    bound_diff = AverageMeter()
    unstable_neurons = AverageMeter()
    dead_neurons = AverageMeter()
    alive_neurons = AverageMeter()
    batch_time = AverageMeter()
    batch_multiplier = kwargs.get("batch_multiplier", 1)
    kappa = 1
    beta = 1
    if train:
        model.train()
    else:
        model.eval()
    # pregenerate the array for specifications, will be used for scatter
    sa = np.zeros((num_class, num_class - 1), dtype=np.int32)
    for i in range(sa.shape[0]):
        for j in range(sa.shape[1]):
            if j < i:
                sa[i][j] = j
            else:
                sa[i][j] = j + 1
    sa = torch.LongTensor(sa)
    batch_size = loader.batch_size * batch_multiplier
    if batch_multiplier > 1 and train:
        logger.log(
            'Warning: Large batch training. The equivalent batch size is {} * {} = {}.'
            .format(batch_multiplier, loader.batch_size, batch_size))
    # per-channel std and mean
    std = torch.tensor(loader.std).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
    mean = torch.tensor(loader.mean).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)

    model_range = 0.0
    end_eps = eps_scheduler.get_eps(t + 1, 0)
    if end_eps < np.finfo(np.float32).tiny:
        logger.log('eps {} close to 0, using natural training'.format(end_eps))
        method = "natural"
    lb_batches = []
    for i, (data, labels) in enumerate(loader):
        start = time.time()
        eps = eps_scheduler.get_eps(t, int(i // batch_multiplier))
        if train and i % batch_multiplier == 0:
            opt.zero_grad()
        # generate specifications
        c = torch.eye(num_class).type_as(data)[labels].unsqueeze(
            1) - torch.eye(num_class).type_as(data).unsqueeze(0)
        # remove specifications to self
        I = (~(labels.data.unsqueeze(1) == torch.arange(num_class).type_as(
            labels.data).unsqueeze(0)))
        c = (c[I].view(data.size(0), num_class - 1, num_class))
        # scatter matrix to avoid compute margin to self
        sa_labels = sa[labels]
        # storing computed lower bounds after scatter
        lb_s = torch.zeros(data.size(0), num_class)
        ub_s = torch.zeros(data.size(0), num_class)

        # FIXME: Assume unnormalized data is from range 0 - 1
        if kwargs["bounded_input"]:
            if norm != np.inf:
                raise ValueError(
                    "bounded input only makes sense for Linf perturbation. "
                    "Please set the bounded_input option to false.")
            data_max = torch.reshape((1. - mean) / std, (1, -1, 1, 1))
            data_min = torch.reshape((0. - mean) / std, (1, -1, 1, 1))
            data_ub = torch.min(data + (eps / std), data_max)
            data_lb = torch.max(data - (eps / std), data_min)
        else:
            if norm == np.inf:
                data_ub = data + (eps / std)
                data_lb = data - (eps / std)
            else:
                # For other norms, eps will be used instead.
                data_ub = data_lb = data

        if list(model.parameters())[0].is_cuda:
            data = data.cuda()
            data_ub = data_ub.cuda()
            data_lb = data_lb.cuda()
            labels = labels.cuda()
            c = c.cuda()
            sa_labels = sa_labels.cuda()
            lb_s = lb_s.cuda()
            ub_s = ub_s.cuda()
        # convert epsilon to a tensor
        eps_tensor = data.new(1)
        eps_tensor[0] = eps

        # omit the regular cross entropy, since we use robust error
        output = model(data,
                       method_opt="forward",
                       disable_multi_gpu=(method == "natural"))
        regular_ce = CrossEntropyLoss()(output, labels)
        regular_ce_losses.update(regular_ce.cpu().detach().numpy(),
                                 data.size(0))
        errors.update(
            torch.sum(
                torch.argmax(output, dim=1) != labels).cpu().detach().numpy() /
            data.size(0), data.size(0))
        # get range statistic
        model_range = output.max().detach().cpu().item() - output.min().detach(
        ).cpu().item()
        '''
        torch.set_printoptions(threshold=5000)
        print('prediction:  ', output)
        ub, lb, _, _, _, _ = model(norm=norm, x_U=data_ub, x_L=data_lb, eps=eps, C=c, method_opt="interval_range")
        lb = lb_s.scatter(1, sa_labels, lb)
        ub = ub_s.scatter(1, sa_labels, ub)
        print('interval ub: ', ub)
        print('interval lb: ', lb)
        ub, _, lb, _ = model(norm=norm, x_U=data_ub, x_L=data_lb, eps=eps, C=c, upper=True, lower=True, method_opt="backward_range")
        lb = lb_s.scatter(1, sa_labels, lb)
        ub = ub_s.scatter(1, sa_labels, ub)
        print('crown-ibp ub: ', ub)
        print('crown-ibp lb: ', lb) 
        ub, _, lb, _ = model(norm=norm, x_U=data_ub, x_L=data_lb, eps=eps, C=c, upper=True, lower=True, method_opt="full_backward_range")
        lb = lb_s.scatter(1, sa_labels, lb)
        ub = ub_s.scatter(1, sa_labels, ub)
        print('full-crown ub: ', ub)
        print('full-crown lb: ', lb)
        input()
        '''

        if verbose or method != "natural":
            if kwargs["bound_type"] == "convex-adv":
                # Wong and Kolter's bound, or equivalently Fast-Lin
                if kwargs["convex-proj"] is not None:
                    proj = kwargs["convex-proj"]
                    if norm == np.inf:
                        norm_type = "l1_median"
                    elif norm == 2:
                        norm_type = "l2_normal"
                    else:
                        raise (ValueError(
                            "Unsupported norm {} for convex-adv".format(norm)))
                else:
                    proj = None
                    if norm == np.inf:
                        norm_type = "l1"
                    elif norm == 2:
                        norm_type = "l2"
                    else:
                        raise (ValueError(
                            "Unsupported norm {} for convex-adv".format(norm)))
                if loader.std == [1] or loader.std == [1, 1, 1]:
                    convex_eps = eps
                else:
                    convex_eps = eps / np.mean(loader.std)
                    # for CIFAR we are roughly / 0.2
                    # FIXME this is due to a bug in convex_adversarial, we cannot use per-channel eps
                if norm == np.inf:
                    # bounded input is only for Linf
                    if kwargs["bounded_input"]:
                        # FIXME the bounded projection in convex_adversarial has a bug, data range must be positive
                        assert loader.std == [1, 1, 1] or loader.std == [1]
                        data_l = 0.0
                        data_u = 1.0
                    else:
                        data_l = -np.inf
                        data_u = np.inf
                else:
                    data_l = data_u = None
                f = DualNetwork(model,
                                data,
                                convex_eps,
                                proj=proj,
                                norm_type=norm_type,
                                bounded_input=kwargs["bounded_input"],
                                data_l=data_l,
                                data_u=data_u)
                lb = f(c)
            elif kwargs["bound_type"] == "interval":
                ub, lb, relu_activity, unstable, dead, alive = model(
                    norm=norm,
                    x_U=data_ub,
                    x_L=data_lb,
                    eps=eps,
                    C=c,
                    method_opt="interval_range")
            elif kwargs["bound_type"] == "crown-full":
                _, _, lb, _ = model(norm=norm,
                                    x_U=data_ub,
                                    x_L=data_lb,
                                    eps=eps,
                                    C=c,
                                    upper=False,
                                    lower=True,
                                    method_opt="full_backward_range")
                unstable = dead = alive = relu_activity = torch.tensor([0])
            elif kwargs["bound_type"] == "crown-interval":
                # Enable multi-GPU only for the computationally expensive CROWN-IBP bounds,
                # not for regular forward propagation and IBP because the communication overhead can outweigh benefits, giving little speedup.
                ub, ilb, relu_activity, unstable, dead, alive = model(
                    norm=norm,
                    x_U=data_ub,
                    x_L=data_lb,
                    eps=eps,
                    C=c,
                    method_opt="interval_range")
                crown_final_beta = kwargs['final-beta']
                beta = (max_eps - eps * (1.0 - crown_final_beta)) / max_eps
                if beta < 1e-5:
                    lb = ilb
                else:
                    if kwargs["runnerup_only"]:
                        # regenerate a smaller c, with just the runner-up prediction
                        # mask the ground-truth label's output, select the second largest class
                        # print(output)
                        # torch.set_printoptions(threshold=5000)
                        masked_output = output.detach().scatter(
                            1, labels.unsqueeze(-1), -100)
                        # print(masked_output)
                        # location of the runner up prediction
                        runner_up = masked_output.max(1)[1]
                        # print(runner_up)
                        # print(labels)
                        # get margin from the ground-truth to runner-up only
                        runnerup_c = torch.eye(num_class).type_as(data)[labels]
                        # print(runnerup_c)
                        # set the runner-up location to -1
                        runnerup_c.scatter_(1, runner_up.unsqueeze(-1), -1)
                        runnerup_c = runnerup_c.unsqueeze(1).detach()
                        # print(runnerup_c)
                        # get the bound for runnerup_c
                        _, _, clb, bias = model(norm=norm,
                                                x_U=data_ub,
                                                x_L=data_lb,
                                                eps=eps,
                                                C=runnerup_c,
                                                method_opt="backward_range")
                        clb = clb.expand(clb.size(0), num_class - 1)
                    else:
                        # get the CROWN bound using interval bounds
                        _, _, clb, bias = model(norm=norm,
                                                x_U=data_ub,
                                                x_L=data_lb,
                                                eps=eps,
                                                C=c,
                                                method_opt="backward_range")
                        bound_bias.update(bias.sum() / data.size(0))
                    # how much better is crown-ibp than ibp?
                    diff = (clb - ilb).sum().item()
                    bound_diff.update(diff / data.size(0), data.size(0))
                    # lb = torch.max(lb, clb)
                    lb = clb * beta + ilb * (1 - beta)
            else:
                raise RuntimeError("Unknown bound_type " +
                                   kwargs["bound_type"])
            lb = lb_s.scatter(1, sa_labels, lb)
            robust_ce = CrossEntropyLoss()(-lb, labels)
            if kwargs["bound_type"] != "convex-adv":

                relu_activities.update(
                    relu_activity.sum().detach().cpu().item() / data.size(0),
                    data.size(0))
                unstable_neurons.update(
                    unstable.sum().detach().cpu().item() / data.size(0),
                    data.size(0))
                dead_neurons.update(
                    dead.sum().detach().cpu().item() / data.size(0),
                    data.size(0))
                alive_neurons.update(
                    alive.sum().detach().cpu().item() / data.size(0),
                    data.size(0))

        if method == "robust":
            loss = robust_ce
        elif method == "robust_activity":
            loss = robust_ce + kwargs["activity_reg"] * relu_activity.sum()
        elif method == "natural":
            loss = regular_ce
        elif method == "robust_natural":
            natural_final_factor = kwargs["final-kappa"]
            kappa = (max_eps - eps * (1.0 - natural_final_factor)) / max_eps
            loss = (1 - kappa) * robust_ce + kappa * regular_ce
        else:
            raise ValueError("Unknown method " + method)

        if train and kwargs["l1_reg"] > np.finfo(np.float32).tiny:
            reg = kwargs["l1_reg"]
            l1_loss = 0.0
            for name, param in model.named_parameters():
                if 'bias' not in name:
                    l1_loss = l1_loss + torch.sum(torch.abs(param))
            l1_loss = reg * l1_loss
            loss = loss + l1_loss
            l1_losses.update(l1_loss.cpu().detach().numpy(), data.size(0))
        if train:
            loss.backward()
            if i % batch_multiplier == 0 or i == len(loader) - 1:
                opt.step()

        losses.update(loss.cpu().detach().numpy(), data.size(0))

        if verbose or method != "natural":
            robust_ce_losses.update(robust_ce.cpu().detach().numpy(),
                                    data.size(0))
            # robust_ce_losses.update(robust_ce, data.size(0))
            robust_errors.update(
                torch.sum(
                    (lb < 0).any(dim=1)).cpu().detach().numpy() / data.size(0),
                data.size(0))

        batch_time.update(time.time() - start)
        if i % 50 == 0 and train:
            logger.log(
                '[{:2d}:{:4d}]: eps {:.4f}  '
                'Time {batch_time.val:.3f} ({batch_time.avg:.3f})  '
                'Total Loss {loss.val:.4f} ({loss.avg:.4f})  '
                'L1 Loss {l1_loss.val:.4f} ({l1_loss.avg:.4f})  '
                'CE {regular_ce_loss.val:.4f} ({regular_ce_loss.avg:.4f})  '
                'RCE {robust_ce_loss.val:.4f} ({robust_ce_loss.avg:.4f})  '
                'Err {errors.val:.4f} ({errors.avg:.4f})  '
                'Rob Err {robust_errors.val:.4f} ({robust_errors.avg:.4f})  '
                'Uns {unstable.val:.1f} ({unstable.avg:.1f})  '
                'Dead {dead.val:.1f} ({dead.avg:.1f})  '
                'Alive {alive.val:.1f} ({alive.avg:.1f})  '
                'Tightness {tight.val:.5f} ({tight.avg:.5f})  '
                'Bias {bias.val:.5f} ({bias.avg:.5f})  '
                'Diff {diff.val:.5f} ({diff.avg:.5f})  '
                'R {model_range:.3f}  '
                'beta {beta:.3f} ({beta:.3f})  '
                'kappa {kappa:.3f} ({kappa:.3f})  '.format(
                    t,
                    i,
                    eps,
                    batch_time=batch_time,
                    loss=losses,
                    errors=errors,
                    robust_errors=robust_errors,
                    l1_loss=l1_losses,
                    regular_ce_loss=regular_ce_losses,
                    robust_ce_loss=robust_ce_losses,
                    unstable=unstable_neurons,
                    dead=dead_neurons,
                    alive=alive_neurons,
                    tight=relu_activities,
                    bias=bound_bias,
                    diff=bound_diff,
                    model_range=model_range,
                    beta=beta,
                    kappa=kappa))
        if verbose or method != "natural":
            lb_batches.append(lb)

    logger.log('[FINAL RESULT epoch:{:2d} eps:{:.4f}]: '
               'Time {batch_time.val:.3f} ({batch_time.avg:.3f})  '
               'Total Loss {loss.val:.4f} ({loss.avg:.4f})  '
               'L1 Loss {l1_loss.val:.4f} ({l1_loss.avg:.4f})  '
               'CE {regular_ce_loss.val:.4f} ({regular_ce_loss.avg:.4f})  '
               'RCE {robust_ce_loss.val:.4f} ({robust_ce_loss.avg:.4f})  '
               'Uns {unstable.val:.3f} ({unstable.avg:.3f})  '
               'Dead {dead.val:.1f} ({dead.avg:.1f})  '
               'Alive {alive.val:.1f} ({alive.avg:.1f})  '
               'Tight {tight.val:.5f} ({tight.avg:.5f})  '
               'Bias {bias.val:.5f} ({bias.avg:.5f})  '
               'Diff {diff.val:.5f} ({diff.avg:.5f})  '
               'Err {errors.val:.4f} ({errors.avg:.4f})  '
               'Rob Err {robust_errors.val:.4f} ({robust_errors.avg:.4f})  '
               'R {model_range:.3f}  '
               'beta {beta:.3f} ({beta:.3f})  '
               'kappa {kappa:.3f} ({kappa:.3f})  \n'.format(
                   t,
                   eps,
                   batch_time=batch_time,
                   loss=losses,
                   errors=errors,
                   robust_errors=robust_errors,
                   l1_loss=l1_losses,
                   regular_ce_loss=regular_ce_losses,
                   robust_ce_loss=robust_ce_losses,
                   unstable=unstable_neurons,
                   dead=dead_neurons,
                   alive=alive_neurons,
                   tight=relu_activities,
                   bias=bound_bias,
                   diff=bound_diff,
                   model_range=model_range,
                   kappa=kappa,
                   beta=beta))
    for i, l in enumerate(
            model if isinstance(model, BoundSequential) else model.module):
        if isinstance(l, BoundLinear) or isinstance(l, BoundConv2d):
            norm = l.weight.data.detach().view(l.weight.size(0),
                                               -1).abs().sum(1).max().cpu()
            logger.log('layer {} norm {}'.format(i, norm))
    if method == "natural":
        return errors.avg, errors.avg, lb_batches
    else:
        return robust_errors.avg, errors.avg, lb_batches
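In the robust_natural branch above, the training loss is a convex combination of the natural and robust cross-entropy terms, with the weight kappa annealed as eps grows toward max_eps. A sketch of just that mixing rule:

def mixed_robust_loss(regular_ce, robust_ce, eps, max_eps, final_kappa):
    # kappa is 1 at eps = 0 (pure natural CE) and decays to final_kappa at eps = max_eps
    kappa = (max_eps - eps * (1.0 - final_kappa)) / max_eps
    return (1 - kappa) * robust_ce + kappa * regular_ce

# early in the schedule the natural loss dominates:
print(mixed_robust_loss(regular_ce=1.0, robust_ce=2.0, eps=0.0, max_eps=0.3, final_kappa=0.5))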
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for computing the token classification loss.
            Indices should be in ``[0, ..., config.num_labels - 1]``.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, self.num_labels)
                active_labels = torch.where(
                    active_loss, labels.view(-1),
                    torch.tensor(loss_fct.ignore_index).type_as(labels))
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, self.num_labels),
                                labels.view(-1))

        if not return_dict:
            output = (logits, ) + outputs[2:]
            return ((loss, ) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
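The token-classification head above only scores positions where the attention mask is 1: labels at padded positions are replaced with the loss function's ignore_index, so padding never contributes to the loss. A small sketch of that masking on (batch, seq_len) inputs:

import torch
from torch.nn import CrossEntropyLoss

batch, seq_len, num_labels = 2, 5, 4
logits = torch.randn(batch, seq_len, num_labels)
labels = torch.randint(0, num_labels, (batch, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 0]])

loss_fct = CrossEntropyLoss()
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, num_labels)
# replace labels at padded positions with ignore_index so they are skipped
active_labels = torch.where(active_loss, labels.view(-1),
                            torch.tensor(loss_fct.ignore_index).type_as(labels))
loss = loss_fct(active_logits, active_labels)
print(loss.item())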
    def forward(
        self,
        input_ids=None,
        past=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        mc_token_ids=None,
        lm_labels=None,
        mc_labels=None,
    ):
        r"""
        mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
            Index of the classification token in each input sequence.
            Selected in the range ``[0, input_ids.size(-1) - 1[``.
        lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for language modeling.
            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``
        mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`):
            Labels for computing the multiple choice classification loss.
            Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
            of the input tensors. (see `input_ids` above)

    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
        lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided):
            Language modeling loss.
        mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``mc_labels`` is provided):
            Multiple choice classification loss.
        lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        mc_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import torch
        from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel

        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2DoubleHeadsModel.from_pretrained('gpt2')

        # Add a [CLS] to the vocabulary (we should train it also!)
        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
        print(tokenizer.cls_token_id, len(tokenizer))  # The newly added token is the last token of the vocabulary

        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
        encoded_choices = [tokenizer.encode(s) for s in choices]
        cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

        input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
        mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1

        outputs = model(input_ids, mc_token_ids=mc_token_ids)
        lm_prediction_scores, mc_prediction_scores = outputs[:2]

        """
        transformer_outputs = self.transformer(
            input_ids,
            past=past,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)
        mc_logits = self.multiple_choice_head(hidden_states,
                                              mc_token_ids).squeeze(-1)

        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
        if mc_labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
                            mc_labels.view(-1))
            outputs = (loss, ) + outputs
        if lm_labels is not None:
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = lm_labels[..., 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                            shift_labels.view(-1))
            outputs = (loss, ) + outputs

        return outputs  # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions)
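Both the snippets above and below compute the causal language-modeling loss the same way: logits at position t are scored against the token at position t + 1, so the logits drop their last step and the labels drop their first. A toy illustration of that shift (tensors are random and exist only to show the shapes):

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 10
lm_logits = torch.randn(1, 5, vocab_size)           # (batch, seq_len, vocab)
labels = torch.randint(0, vocab_size, (1, 5))       # typically labels = input_ids

shift_logits = lm_logits[..., :-1, :].contiguous()  # positions 0..3 ...
shift_labels = labels[..., 1:].contiguous()         # ... predict tokens 1..4
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(loss)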
Example #25
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
    )
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_TYPES),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS),
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--labels",
        default="",
        type=str,
        help=
        "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        action="store_true",
                        help="Whether to run predictions on the test set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Whether to run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--keep_accents",
                        action="store_const",
                        const=True,
                        help="Set this flag if model is trained with accents.")
    parser.add_argument(
        "--strip_accents",
        action="store_const",
        const=True,
        help="Set this flag if model is trained without accents.")
    parser.add_argument("--use_fast",
                        action="store_const",
                        const=True,
                        help="Set this flag to use fast tokenization.")
    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument("--logging_steps",
                        type=int,
                        default=500,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=500,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name and ending with the step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="For distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="For distant debugging.")
    args = parser.parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overwrite it."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare CONLL-2003 task
    labels = get_labels(args.labels)
    num_labels = len(labels)
    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
    pad_token_label_id = CrossEntropyLoss().ignore_index

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        id2label={str(i): label
                  for i, label in enumerate(labels)},
        label2id={label: i
                  for i, label in enumerate(labels)},
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer_args = {
        k: v
        for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS
    }
    logger.info("Tokenizer arguments: %s", tokenizer_args)
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
        **tokenizer_args,
    )
    model = AutoModelForTokenClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                labels,
                                                pad_token_label_id,
                                                mode="train")
        global_step, tr_loss = train(args, train_dataset, model, tokenizer,
                                     labels, pad_token_label_id)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir,
                                                  **tokenizer_args)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            model = AutoModelForTokenClassification.from_pretrained(checkpoint)
            model.to(args.device)
            result, _ = evaluate(args,
                                 model,
                                 tokenizer,
                                 labels,
                                 pad_token_label_id,
                                 mode="dev",
                                 prefix=global_step)
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))

    if args.do_predict and args.local_rank in [-1, 0]:
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir,
                                                  **tokenizer_args)
        model = AutoModelForTokenClassification.from_pretrained(
            args.output_dir)
        model.to(args.device)
        result, predictions = evaluate(args,
                                       model,
                                       tokenizer,
                                       labels,
                                       pad_token_label_id,
                                       mode="test")
        # Save results
        output_test_results_file = os.path.join(args.output_dir,
                                                "test_results.txt")
        with open(output_test_results_file, "w") as writer:
            for key in sorted(result.keys()):
                writer.write("{} = {}\n".format(key, str(result[key])))
        # Save predictions
        output_test_predictions_file = os.path.join(args.output_dir,
                                                    "test_predictions.txt")
        with open(output_test_predictions_file, "w") as writer:
            with open(os.path.join(args.data_dir, "test.txt"), "r") as f:
                example_id = 0
                for line in f:
                    if line.startswith(
                            "-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)
                        if not predictions[example_id]:
                            example_id += 1
                    elif predictions[example_id]:
                        output_line = line.split(
                        )[0] + " " + predictions[example_id].pop(0) + "\n"
                        writer.write(output_line)
                    else:
                        logger.warning(
                            "Maximum sequence length exceeded: No prediction for '%s'.",
                            line.split()[0])

    return results
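Two small details of the setup above are easy to miss: the config is given explicit id2label/label2id maps built from the label list, and padding positions reuse CrossEntropyLoss's ignore_index as their label id so only real labels contribute to the loss. A toy version of both (the CoNLL-style label list below is a placeholder, not the task's full tag set):

from torch.nn import CrossEntropyLoss

labels = ["O", "B-PER", "I-PER", "B-LOC", "I-LOC"]   # placeholder label set
num_labels = len(labels)

id2label = {str(i): label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

# Padding / ignored sub-token positions get the loss's ignore_index (-100 by
# default), so they are skipped when the token-classification loss is computed.
pad_token_label_id = CrossEntropyLoss().ignore_index
print(num_labels, pad_token_label_id)                # 5 -100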
    def forward(
        self,
        input_ids=None,
        past=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for language modeling.
            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``

    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided):
            Language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        """
        transformer_outputs = self.transformer(
            input_ids,
            past=past,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        outputs = (lm_logits,) + transformer_outputs[1:]

        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)
Example #27
    def __init__(self,
                 criterion=None,
                 lr: float = 0.001,
                 momentum=0.9,
                 l2=0.0005,
                 train_epochs: int = 4,
                 init_update_rate: float = 0.01,
                 inc_update_rate=0.00005,
                 max_r_max=1.25,
                 max_d_max=0.5,
                 inc_step=4.1e-05,
                 rm_sz: int = 1500,
                 freeze_below_layer: str = "lat_features.19.bn.beta",
                 latent_layer_num: int = 19,
                 ewc_lambda: float = 0,
                 train_mb_size: int = 128,
                 eval_mb_size: int = 128,
                 device=None,
                 plugins: Optional[Sequence[StrategyPlugin]] = None,
                 evaluator: EvaluationPlugin = default_logger,
                 eval_every=-1):
        """
        Creates an instance of the AR1 strategy.

        :param criterion: The loss criterion to use. Defaults to None, in which
            case the cross entropy loss is used.
        :param lr: The learning rate (SGD optimizer).
        :param momentum: The momentum (SGD optimizer).
        :param l2: The L2 penalty used for weight decay.
        :param train_epochs: The number of training epochs. Defaults to 4.
        :param init_update_rate: The initial update rate of BatchReNorm layers.
        :param inc_update_rate: The incremental update rate of BatchReNorm
            layers.
        :param max_r_max: The maximum r value of BatchReNorm layers.
        :param max_d_max: The maximum d value of BatchReNorm layers.
        :param inc_step: The incremental step of r and d values of BatchReNorm
            layers.
        :param rm_sz: The size of the replay buffer. The replay buffer is shared
            across classes. Defaults to 1500.
        :param freeze_below_layer: A string describing the name of the layer
            to use while freezing the lower (nearest to the input) part of the
            model. The given layer is not frozen (exclusive).
        :param latent_layer_num: The number of the layer to use as the Latent
            Replay Layer. Usually this is the same as `freeze_below_layer`.
        :param ewc_lambda: The Synaptic Intelligence lambda term. Defaults to
            0, which means that the Synaptic Intelligence regularization
            will not be applied.
        :param train_mb_size: The train minibatch size. Defaults to 128.
        :param eval_mb_size: The eval minibatch size. Defaults to 128.
        :param device: The device to use. Defaults to None (cpu).
        :param plugins: (optional) list of StrategyPlugins.
        :param evaluator: (optional) instance of EvaluationPlugin for logging
            and metric computations.
        :param eval_every: the frequency of the calls to `eval` inside the
            training loop.
                if -1: no evaluation during training.
                if  0: calls `eval` after the final epoch of each training
                    experience.
                if >0: calls `eval` every `eval_every` epochs and at the end
                    of all the epochs for a single experience.
        """

        warnings.warn("The AR1 strategy implementation is in an alpha stage "
                      "and is not perfectly aligned with the paper "
                      "implementation. Please use at your own risk!")

        if plugins is None:
            plugins = []

        # Model setup
        model = MobilenetV1(pretrained=True, latent_layer_num=latent_layer_num)
        replace_bn_with_brn(model,
                            momentum=init_update_rate,
                            r_d_max_inc_step=inc_step,
                            max_r_max=max_r_max,
                            max_d_max=max_d_max)

        fc_name, fc_layer = get_last_fc_layer(model)

        if ewc_lambda != 0:
            # Synaptic Intelligence is not applied to the last fully
            # connected layer (and, implicitly, to the "freeze below" layers).
            plugins.append(
                SynapticIntelligencePlugin(ewc_lambda,
                                           excluded_parameters=[fc_name]))

        self.cwr_plugin = CWRStarPlugin(model,
                                        cwr_layer_name=fc_name,
                                        freeze_remaining_model=False)
        plugins.append(self.cwr_plugin)

        optimizer = SGD(model.parameters(),
                        lr=lr,
                        momentum=momentum,
                        weight_decay=l2)

        if criterion is None:
            criterion = CrossEntropyLoss()

        self.ewc_lambda = ewc_lambda
        self.freeze_below_layer = freeze_below_layer
        self.rm_sz = rm_sz
        self.inc_update_rate = inc_update_rate
        self.max_r_max = max_r_max
        self.max_d_max = max_d_max
        self.lr = lr
        self.momentum = momentum
        self.l2 = l2
        self.rm = None
        self.cur_acts: Optional[Tensor] = None
        self.replay_mb_size = 0

        super().__init__(model,
                         optimizer,
                         criterion,
                         train_mb_size=train_mb_size,
                         train_epochs=train_epochs,
                         eval_mb_size=eval_mb_size,
                         device=device,
                         plugins=plugins,
                         evaluator=evaluator,
                         eval_every=eval_every)
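The freeze_below_layer argument names the usual pattern of freezing everything up to (but not including) a given layer; the AR1 strategy performs that freezing elsewhere. As a rough sketch of the idea, with a placeholder model and layer name rather than the strategy's actual code:

import torch.nn as nn

def freeze_up_to(model: nn.Module, stop_name: str) -> None:
    # Freeze parameters until `stop_name` is reached; that parameter and
    # everything after it stay trainable (the named layer is exclusive).
    for name, param in model.named_parameters():
        if name == stop_name:
            break
        param.requires_grad = False

toy_model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
freeze_up_to(toy_model, "2.weight")                  # placeholder layer name
print([(n, p.requires_grad) for n, p in toy_model.named_parameters()])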
Example #28
    def forward(self,
                input_ids,
                token_type_ids=None,
                attention_mask=None,
                lus=None,
                senses=None,
                args=None,
                using_gold_fame=False,
                position_ids=None,
                head_mask=None):
        sequence_output, pooled_output = self.bert(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask)
        sequence_output = self.dropout(sequence_output)
        pooled_output = self.dropout(pooled_output)

        #         with torch.no_grad():
        #             pooled_output = self.mlp_model(pooled_output)
        #         self.mlp_model.train()
        #         self.mlp_model.eval()

        #         pooled_output = self.mlp_model(pooled_output)

        sense_logits = self.sense_classifier(pooled_output)
        arg_logits = self.arg_classifier(sequence_output)

        lufr_masks = utils.get_masks(lus,
                                     self.lufrmap,
                                     num_label=self.num_senses,
                                     masking=self.masking).to(device)

        sense_loss = 0  # loss for sense id
        arg_loss = 0  # loss for arg id

        if senses is not None:
            for i in range(len(sense_logits)):
                sense_logit = sense_logits[i]
                arg_logit = arg_logits[i]

                lufr_mask = lufr_masks[i]

                gold_sense = senses[i]
                gold_arg = args[i]

                #train sense classifier
                loss_fct_sense = CrossEntropyLoss(weight=lufr_mask)
                loss_per_seq_for_sense = loss_fct_sense(
                    sense_logit.view(-1, self.num_senses), gold_sense.view(-1))
                sense_loss += loss_per_seq_for_sense

                #train arg classifier
                masked_sense_logit = utils.masking_logit(
                    sense_logit, lufr_mask)
                pred_sense, sense_score = utils.logit2label(masked_sense_logit)

                frarg_mask = utils.get_masks([pred_sense],
                                             self.frargmap,
                                             num_label=self.num_args,
                                             masking=True).to(device)[0]
                loss_fct_arg = CrossEntropyLoss(weight=frarg_mask)

                # only keep active parts of loss
                if attention_mask is not None:
                    active_loss = attention_mask[i].view(-1) == 1
                    active_logits = arg_logit.view(-1,
                                                   self.num_args)[active_loss]
                    active_labels = gold_arg.view(-1)[active_loss]
                    loss_per_seq_for_arg = loss_fct_arg(
                        active_logits, active_labels)
                else:
                    loss_per_seq_for_arg = loss_fct_arg(
                        arg_logit.view(-1, self.num_args), gold_arg.view(-1))
                arg_loss += loss_per_seq_for_arg

            # 0.5 weighted loss
#             if self.original_loss:
#                 loss = (sense_loss, arg_loss)
#             else:
            total_loss = 0.5 * sense_loss + 0.5 * arg_loss
            loss = total_loss / len(sense_logits)

            if self.return_pooled_output:
                return pooled_output, loss
            else:
                return loss
        else:
            if self.return_pooled_output:
                return pooled_output, sense_logits, arg_logits
            else:
                return sense_logits, arg_logits
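CrossEntropyLoss(weight=...) scales each example's loss by the weight of its gold class, so passing a 0/1 mask as the weight vector (as with lufr_mask and frarg_mask above) effectively drops examples whose gold label falls outside the licensed set. A toy sketch of that behaviour (sizes and mask values are illustrative):

import torch
from torch.nn import CrossEntropyLoss

num_senses = 4
logits = torch.randn(2, num_senses)
gold = torch.tensor([2, 3])

mask = torch.tensor([0.0, 1.0, 1.0, 0.0])   # only senses 1 and 2 are licensed

loss_fct = CrossEntropyLoss(weight=mask)
# The second example's gold class (3) has weight 0, so it is ignored;
# only the first example (gold class 2) drives the averaged loss.
loss = loss_fct(logits, gold)
print(loss)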
Example #29
def main():
    argument_parser = argparse.ArgumentParser()
    argument_parser.add_argument("--path_to_train_data",
                                 type=str,
                                 required=True)
    argument_parser.add_argument("--path_to_eval_data",
                                 type=str,
                                 required=False,
                                 default=None)
    argument_parser.add_argument("--n_epochs",
                                 type=int,
                                 required=False,
                                 default=3)
    argument_parser.add_argument("--batch_size",
                                 type=int,
                                 required=False,
                                 default=32)
    argument_parser.add_argument("--bptt",
                                 type=int,
                                 required=False,
                                 default=64)
    argument_parser.add_argument("--lr",
                                 type=float,
                                 required=False,
                                 default=0.0001)
    argument_parser.add_argument("--vocabulary_size",
                                 type=int,
                                 required=False,
                                 default=20000)
    argument_parser.add_argument("--embedding_dimension",
                                 type=int,
                                 required=False,
                                 default=300)
    argument_parser.add_argument("--hidden_units_for_lstm",
                                 type=int,
                                 required=False,
                                 default=256)
    argument_parser.add_argument("--num_of_lstm_layer",
                                 type=int,
                                 required=False,
                                 default=1)
    argument_parser.add_argument("--n_decoder_blocks",
                                 type=int,
                                 required=False,
                                 default=5)

    arguments = argument_parser.parse_args()

    train_language_modeling_dataset = LanguageModelingDataset(
        arguments.batch_size, arguments.bptt)
    train_language_modeling_dataset.set_tokenizer(ByteLevelBPETokenizer())
    train_language_modeling_dataset.fit(
        arguments.path_to_train_data,
        vocabulary_size=arguments.vocabulary_size)

    train_language_modeling_dataloader = LanguageModelingDataLoader(
        arguments.bptt,
        train_language_modeling_dataset.transform(arguments.path_to_train_data,
                                                  return_target=True),
    )

    model = LSTMModel(
        arguments.vocabulary_size,
        arguments.embedding_dimension,
        arguments.hidden_units_for_lstm,
        arguments.n_decoder_blocks,
        arguments.num_of_lstm_layer,
    )

    logger = TensorboardLogger()
    trainer = Trainer(arguments.batch_size)
    trainer.set_logger(logger)

    if arguments.path_to_eval_data:
        eval_language_modeling_dataloader = LanguageModelingDataLoader(
            arguments.bptt,
            train_language_modeling_dataset.transform(
                arguments.path_to_eval_data, return_target=True),
        )

        trainer.train(
            model,
            train_language_modeling_dataloader,
            CrossEntropyLoss(),
            Adam(model.parameters(), arguments.lr),
            eval_language_modeling_dataloader,
            arguments.n_epochs,
        )

    else:
        trainer.train(
            model,
            train_language_modeling_dataloader,
            CrossEntropyLoss(),
            Adam(model.parameters(), arguments.lr),
            None,
            arguments.n_epochs,
        )

    logger.log_params(vars(arguments), trainer.losses)
    saver = Saver(logger.log_dir())
    saver.save_preprocessor_and_model(train_language_modeling_dataset, model)
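LSTMModel itself is defined elsewhere in this project; purely as an assumption about its rough shape, a minimal language model with the same kind of hyperparameters (vocabulary size, embedding dimension, LSTM hidden units and layers) could look like the stand-in below. This is only a sketch, not the project's class, and it ignores n_decoder_blocks:

import torch
import torch.nn as nn

class TinyLSTMLM(nn.Module):
    # Illustrative language model: embed -> LSTM -> project back to the vocabulary.
    def __init__(self, vocab_size, embedding_dim, hidden_units, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_units,
                            num_layers=num_layers, batch_first=True)
        self.decoder = nn.Linear(hidden_units, vocab_size)

    def forward(self, input_ids):
        emb = self.embedding(input_ids)
        out, _ = self.lstm(emb)
        return self.decoder(out)              # (batch, seq_len, vocab_size)

model = TinyLSTMLM(vocab_size=20000, embedding_dim=300,
                   hidden_units=256, num_layers=1)
tokens = torch.randint(0, 20000, (32, 64))    # batch_size x bptt, as in the defaults
logits = model(tokens)
loss = nn.CrossEntropyLoss()(logits[:, :-1].reshape(-1, 20000),
                             tokens[:, 1:].reshape(-1))
print(loss)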
Example #30
    def forward(
        self,
        input_ids=None,
        first_check=None,
        position=None,
        past=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
            Labels for language modeling.
            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
            All labels set to ``-100`` are ignored (masked), the loss is only
            computed for labels in ``[0, ..., config.vocab_size]``

    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
        loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided):
            Language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model
            should not be passed as input ids as they have already been computed.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.

    Examples::

        import torch
        from transformers import GPT2Tokenizer, GPT2LMHeadModel

        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2LMHeadModel.from_pretrained('gpt2')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, labels=input_ids)
        loss, logits = outputs[:2]

        """
        transformer_outputs, position = self.transformer(
            input_ids,
            first_check,
            position,
            past=past,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        # NOTE: this early return skips the language-modeling head below, so the
        # loss/logits code that follows is currently unreachable.
        return transformer_outputs, position

        hidden_states = transformer_outputs[0]

        lm_logits = self.lm_head(hidden_states)

        outputs = (lm_logits, ) + transformer_outputs[1:]
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = lm_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                            shift_labels.view(-1))
            outputs = (loss, ) + outputs

        return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)
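The docstrings above repeatedly note that labels set to -100 are ignored; that value is simply CrossEntropyLoss's default ignore_index, so no extra masking code is needed. A tiny demonstration with made-up tensors:

import torch
from torch.nn import CrossEntropyLoss

logits = torch.randn(4, 10)                  # 4 positions, vocabulary of 10
labels = torch.tensor([3, -100, 7, -100])    # two positions are masked out

loss_fct = CrossEntropyLoss()                # ignore_index defaults to -100
loss = loss_fct(logits, labels)              # averaged over the two kept labels
print(loss)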