import torch
from collections import OrderedDict
from functools import partial

from torch.autograd import Variable
from torch.nn import CrossEntropyLoss, Module
from torch.optim import SGD

from .utils import add_metrics_to_log, get_loader, log_to_message, ProgressBar

DEFAULT_LOSS = CrossEntropyLoss()
DEFAULT_OPTIMIZER = partial(SGD, lr=0.001, momentum=0.9)


class FitModule(Module):

    def fit(self,
            X,
            y,
            batch_size=32,
            epochs=1,
            verbose=1,
            validation_split=0.,
            validation_data=None,
            shuffle=True,
            initial_epoch=0,
            seed=None,
            loss=DEFAULT_LOSS,
            optimizer=DEFAULT_OPTIMIZER,
            metrics=None):
        """Trains the model similar to Keras' .fit(...) method
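# Minimal usage sketch of the Keras-style interface above, assuming the (truncated)
# FitModule.fit(...) body is available; TinyNet and the random tensors are purely
# illustrative.
import torch


class TinyNet(FitModule):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(20, 3)

    def forward(self, x):
        return self.fc(x)


X = torch.randn(128, 20)           # features
y = torch.randint(0, 3, (128,))    # integer class labels expected by CrossEntropyLoss
model = TinyNet()
# Falls back to the defaults defined above: CrossEntropyLoss and SGD(lr=0.001, momentum=0.9).
model.fit(X, y, batch_size=32, epochs=5, validation_split=0.2)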
def forward(
    self,
    input_ids=None,
    past=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_tuple=None,
):
    r"""
    labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
        Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
        ``labels = input_ids``. Indices are selected in ``[-100, 0, ..., config.vocab_size]``. All labels set to
        ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``.
    """
    return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple

    transformer_outputs = self.transformer(
        input_ids,
        past=past,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_tuple=return_tuple,
    )
    hidden_states = transformer_outputs[0]

    lm_logits = self.lm_head(hidden_states)

    loss = None
    if labels is not None:
        # Shift so that tokens < n predict n
        shift_logits = lm_logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        # Flatten the tokens
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

    if return_tuple:
        output = (lm_logits,) + transformer_outputs[1:]
        return ((loss,) + output) if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=lm_logits,
        past_key_values=transformer_outputs.past_key_values,
        hidden_states=transformer_outputs.hidden_states,
        attentions=transformer_outputs.attentions,
    )
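# Standalone sketch of the causal-LM loss computed above: each position predicts the
# next token, and -100 labels are ignored. Shapes and tensors are illustrative.
import torch
from torch.nn import CrossEntropyLoss

batch, seq_len, vocab = 2, 6, 11
lm_logits = torch.randn(batch, seq_len, vocab)
labels = torch.randint(0, vocab, (batch, seq_len))
labels[:, 3] = -100  # -100 is CrossEntropyLoss's default ignore_index

shift_logits = lm_logits[..., :-1, :].contiguous()  # predictions for positions 0..seq_len-2
shift_labels = labels[..., 1:].contiguous()         # targets are the tokens at positions 1..seq_len-1
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab), shift_labels.view(-1))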
        self.linear_layers = Sequential(Linear(196, 10))

    # Defining the forward pass
    def forward(self, x):
        x = self.cnn_layers(x.float()).float()
        x = x.view(x.size(0), -1)
        x = self.linear_layers(x)
        return x


model = Net()
# defining the optimizer
optimizer = Adam(model.parameters(), lr=0.07)
# defining the loss function
criterion = CrossEntropyLoss()
# checking if GPU is available (GPU use is currently disabled by the hard-coded False)
if False:  # torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()

print(model)


def train(epoch):
    model.train()
    tr_loss = 0
    # getting the training set
    x_train, y_train = Variable(train_x), Variable(train_y)
    # getting the validation set
    x_val, y_val = Variable(val_x), Variable(val_y)
def forward(
    self,
    input_ids=None,
    token_type_ids=None,
    attention_mask=None,
    labels=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    r"""
    labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
        Labels for computing the multiple choice classification loss. Indices should be in
        ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension of the input
        tensors. (see `input_ids` above)
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

    flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
    flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
    flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
    flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
    flat_inputs_embeds = (inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
                          if inputs_embeds is not None else None)

    outputs = self.roberta(
        flat_input_ids,
        position_ids=flat_position_ids,
        token_type_ids=flat_token_type_ids,
        attention_mask=flat_attention_mask,
        head_mask=head_mask,
        inputs_embeds=flat_inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    pooled_output = outputs[1]

    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)
    reshaped_logits = logits.view(-1, num_choices)

    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(reshaped_logits, labels)

    if not return_dict:
        output = (reshaped_logits, ) + outputs[2:]
        return ((loss, ) + output) if loss is not None else output

    return MultipleChoiceModelOutput(
        loss=loss,
        logits=reshaped_logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
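# Standalone sketch of the multiple-choice reshaping above: choices are flattened into
# the batch dimension for the encoder, then the per-choice scores are folded back so
# CrossEntropyLoss selects one choice per example. The linear head and random tensors
# stand in for the RoBERTa encoder.
import torch
from torch.nn import CrossEntropyLoss

batch, num_choices, seq_len, hidden = 2, 4, 8, 16
input_ids = torch.randint(0, 100, (batch, num_choices, seq_len))
flat_input_ids = input_ids.view(-1, input_ids.size(-1))       # (batch * num_choices, seq_len)
pooled_output = torch.randn(flat_input_ids.size(0), hidden)   # stand-in for the pooled encoder output
classifier = torch.nn.Linear(hidden, 1)                       # one score per choice
reshaped_logits = classifier(pooled_output).view(-1, num_choices)  # (batch, num_choices)
labels = torch.tensor([1, 3])                                 # index of the correct choice per example
loss = CrossEntropyLoss()(reshaped_logits, labels)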
# Model
logging.info('==> Building model..')
# currently fold only for resnet18 for HW implementation
if args.fold:
    assert (args.model == 'resnet18')
    args.model += 'a'
modelClass = Models.__dict__[args.model]
model = modelClass(args)

# Load preTrained weights.
logging.info('==> Resuming from checkpoint..')
model.loadPreTrained()
model = model.cuda()
criterion = CrossEntropyLoss().cuda()
run = Run(model, logging, criterion)

# log command line
logging.info('CommandLine: {} PID: {} '
             'Hostname: {} CUDA_VISIBLE_DEVICES {}'.format(
                 argv, getpid(), gethostname(),
                 environ.get('CUDA_VISIBLE_DEVICES')))

# Weights quantization
if args.weightBitwidth < 32 and not args.fold:
    model_path = './qmodels'
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    model_path = os.path.join(
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    start_positions=None,
    end_positions=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    r"""
    start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
        Labels for position (index) of the start of the labelled span for computing the token classification loss.
        Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
        are not taken into account for computing the loss.
    end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
        Labels for position (index) of the end of the labelled span for computing the token classification loss.
        Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
        are not taken into account for computing the loss.
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    outputs = self.roberta(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    sequence_output = outputs[0]

    logits = self.qa_outputs(sequence_output)
    start_logits, end_logits = logits.split(1, dim=-1)
    start_logits = start_logits.squeeze(-1)
    end_logits = end_logits.squeeze(-1)

    total_loss = None
    if start_positions is not None and end_positions is not None:
        # If we are on multi-GPU, split add a dimension
        if len(start_positions.size()) > 1:
            start_positions = start_positions.squeeze(-1)
        if len(end_positions.size()) > 1:
            end_positions = end_positions.squeeze(-1)
        # sometimes the start/end positions are outside our model inputs, we ignore these terms
        ignored_index = start_logits.size(1)
        start_positions.clamp_(0, ignored_index)
        end_positions.clamp_(0, ignored_index)

        loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
        start_loss = loss_fct(start_logits, start_positions)
        end_loss = loss_fct(end_logits, end_positions)
        total_loss = (start_loss + end_loss) / 2

    if not return_dict:
        output = (start_logits, end_logits) + outputs[2:]
        return ((total_loss, ) + output) if total_loss is not None else output

    return QuestionAnsweringModelOutput(
        loss=total_loss,
        start_logits=start_logits,
        end_logits=end_logits,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
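# Standalone sketch of the span loss above: out-of-range start/end positions are
# clamped to ignored_index and then excluded from the loss via ignore_index. Shapes
# and positions are illustrative.
import torch
from torch.nn import CrossEntropyLoss

batch, seq_len = 3, 10
start_logits = torch.randn(batch, seq_len)
end_logits = torch.randn(batch, seq_len)
start_positions = torch.tensor([2, 15, 4])  # 15 lies outside the 10-token sequence
end_positions = torch.tensor([5, 20, 7])    # 20 lies outside the 10-token sequence

ignored_index = start_logits.size(1)        # == seq_len
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)

loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
total_loss = (loss_fct(start_logits, start_positions) +
              loss_fct(end_logits, end_positions)) / 2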
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, **kwargs): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): Used to hide legacy arguments that have been deprecated. """ if "masked_lm_labels" in kwargs: warnings.warn( "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", FutureWarning, ) labels = kwargs.pop("masked_lm_labels") assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) masked_lm_loss = None if labels is not None: #loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss(ignore_index=-1) masked_lm_loss = loss_fct( prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) if not return_dict: output = (prediction_scores, ) + outputs[2:] return ((masked_lm_loss, ) + output) if masked_lm_loss is not None else output return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
def forward( self, input_values, attention_mask=None, output_attentions=None, output_hidden_states=None, return_dict=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states outputs = self.sew( input_values, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) if self.config.use_weighted_layer_sum: hidden_states = outputs[_HIDDEN_STATES_START_POSITION] hidden_states = torch.stack(hidden_states, dim=1) norm_weights = nn.functional.softmax(self.layer_weights, dim=-1) hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1) else: hidden_states = outputs[0] hidden_states = self.projector(hidden_states) if attention_mask is None: pooled_output = hidden_states.mean(dim=1) else: padding_mask = self._get_feature_vector_attention_mask( hidden_states.shape[1], attention_mask) hidden_states[~padding_mask] = 0.0 pooled_output = hidden_states.sum(dim=1) / padding_mask.sum( dim=1).view(-1, 1) logits = self.classifier(pooled_output) loss = None if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) if not return_dict: output = (logits, ) + outputs[_HIDDEN_STATES_START_POSITION:] return ((loss, ) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
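# Standalone sketch of the attention-mask-aware mean pooling above: padded positions
# are zeroed, and each example is averaged over its true (unpadded) length. Shapes are
# illustrative.
import torch

batch, seq_len, hidden = 2, 5, 4
hidden_states = torch.randn(batch, seq_len, hidden)
padding_mask = torch.tensor([[True, True, True, False, False],
                             [True, True, True, True, True]])
hidden_states[~padding_mask] = 0.0
pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)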
def loss(self):
    """Return the cross-entropy criterion used by this model."""
    return CrossEntropyLoss()
def forward(self, beam_size=1, cls_ids=None, input_ids=None, attention_mask=None, token_type_ids=None, input_mask=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, p_mask=None, global_attention_mask=None): # set global attention on question tokens if global_attention_mask is None: # logger.info("Initializing global attention on question tokens...") # put global attention on all tokens until `config.sep_token_id` is reached # global_attention_mask = _compute_global_attention_mask(input_ids, self.config.sep_token_id) global_attention_mask = p_mask outputs = self.longformer( input_ids, global_attention_mask=global_attention_mask, attention_mask=attention_mask, # token_type_ids=None, position_ids=position_ids, inputs_embeds=inputs_embeds, ) hidden_states = outputs[0] start_logits = self.start_logits(hidden_states, p_mask=p_mask) outputs = outputs[ 1:] # Keep mems, hidden states, attentions if there are in it if start_positions is not None and end_positions is not None: # If we are on multi-GPU, let's remove the dimension added by batch splitting for x in (start_positions, end_positions): if x is not None and x.dim() > 1: x.squeeze_(-1) # during training, compute the end logits based on the ground truth of the start position end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) loss_fct = CrossEntropyLoss() start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = total_loss else: # during inference, compute the end logits based on beam search bsz, slen, hsz = hidden_states.size() start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) # start_log_probs = F.sigmoid(start_logits) start_top_log_probs, start_top_index = torch.topk( start_log_probs, beam_size, dim=-1) # shape (bsz, start_n_top) start_top_index_exp = start_top_index.unsqueeze(-1).expand( -1, -1, hsz) # shape (bsz, start_n_top, hsz) start_states = torch.gather( hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) start_states = start_states.unsqueeze(1).expand( -1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( start_states) # shape (bsz, slen, start_n_top, hsz) p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) # end_log_probs = F.sigmoid(end_logits) end_top_log_probs, end_top_index = torch.topk( end_log_probs, beam_size, dim=1) # shape (bsz, end_n_top, start_n_top) end_top_log_probs = end_top_log_probs.view(-1, beam_size * beam_size) end_top_index = end_top_index.view(-1, beam_size * beam_size) outputs = start_top_log_probs, start_top_index, end_top_log_probs, end_top_index return outputs
def forward(self, beam_size=1, cls_ids=None, input_ids=None, attention_mask=None, token_type_ids=None, input_mask=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, p_mask=None): outputs = self.model( input_ids, attention_mask=attention_mask, ) hidden_states = outputs[1][-3] # hidden_states = outputs[0] # print(hidden_states.shape) # hidden_states = torch.cat((outputs[2][-1],outputs[2][-2], outputs[2][-3], outputs[2][-4]),-1) start_logits = self.start_logits(hidden_states, p_mask=p_mask) outputs = outputs[ 1:] # Keep mems, hidden states, attentions if there are in it if start_positions is not None and end_positions is not None: # If we are on multi-GPU, let's remove the dimension added by batch splitting for x in (start_positions, end_positions): if x is not None and x.dim() > 1: x.squeeze_(-1) # during training, compute the end logits based on the ground truth of the start position end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) loss_fct = CrossEntropyLoss() start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = total_loss else: # during inference, compute the end logits based on beam search bsz, slen, hsz = hidden_states.size() start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) start_top_log_probs, start_top_index = torch.topk( start_log_probs, beam_size, dim=-1) # shape (bsz, start_n_top) start_top_index_exp = start_top_index.unsqueeze(-1).expand( -1, -1, hsz) # shape (bsz, start_n_top, hsz) start_states = torch.gather( hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) start_states = start_states.unsqueeze(1).expand( -1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( start_states) # shape (bsz, slen, start_n_top, hsz) p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) end_top_log_probs, end_top_index = torch.topk( end_log_probs, beam_size, dim=1) # shape (bsz, end_n_top, start_n_top) end_top_log_probs = end_top_log_probs.view(-1, beam_size * beam_size) end_top_index = end_top_index.view(-1, beam_size * beam_size) outputs = start_top_log_probs, start_top_index, end_top_log_probs, end_top_index return outputs
def main_worker(index, opt):
    random.seed(opt.manual_seed)
    np.random.seed(opt.manual_seed)
    torch.manual_seed(opt.manual_seed)

    if index >= 0 and opt.device.type == 'cuda':
        opt.device = torch.device(f'cuda:{index}')

    if opt.distributed:
        opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index
        dist.init_process_group(backend='nccl',
                                init_method=opt.dist_url,
                                world_size=opt.world_size,
                                rank=opt.dist_rank)
        # per-GPU batch size: the total batch size divided by the number of GPUs per node
        opt.batch_size = int(opt.batch_size / opt.ngpus_per_node)
        opt.n_threads = int(
            (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node)
    opt.is_master_node = not opt.distributed or opt.dist_rank == 0

    model = generate_model(opt)  # load the model; change which model is loaded in model.py
    if opt.batchnorm_sync:
        assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.'
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    if opt.pretrain_path:
        model = load_pretrained_model(model, opt.pretrain_path, opt.model,
                                      opt.n_finetune_classes)  # when pretraining weights exist, load them; fine-tuning uses pretrain_path
    if opt.resume_path is not None:
        model = resume_model(opt.resume_path, opt.arch, model)  # resume from an interrupted run; testing uses resume_path
    model = make_data_parallel(model, opt.distributed, opt.device)

    if opt.pretrain_path:
        parameters = get_fine_tuning_parameters(model, opt.ft_begin_module)
    else:
        parameters = model.parameters()

    if opt.is_master_node:
        print(model)

    criterion = CrossEntropyLoss().to(opt.device)  # cross-entropy loss

    if not opt.no_train:
        (train_loader, train_sampler, train_logger, train_batch_logger,
         optimizer, scheduler) = get_train_utils(opt, parameters)
        if opt.resume_path is not None:
            opt.begin_epoch, optimizer, scheduler = resume_train_utils(
                opt.resume_path, opt.begin_epoch, optimizer, scheduler)
            if opt.overwrite_milestones:
                scheduler.milestones = opt.multistep_milestones
    if not opt.no_val:
        val_loader, val_logger = get_val_utils(opt)

    if opt.tensorboard and opt.is_master_node:
        from torch.utils.tensorboard import SummaryWriter
        if opt.begin_epoch == 1:
            tb_writer = SummaryWriter(log_dir=opt.result_path)
        else:
            tb_writer = SummaryWriter(log_dir=opt.result_path,
                                      purge_step=opt.begin_epoch)
    else:
        tb_writer = None

    prev_val_loss = None
    for i in range(opt.begin_epoch, opt.n_epochs + 1):  # start training
        if not opt.no_train:
            if opt.distributed:
                train_sampler.set_epoch(i)
            current_lr = get_lr(optimizer)
            train_epoch(i, train_loader, model, criterion, optimizer,
                        opt.device, current_lr, train_logger,
                        train_batch_logger, tb_writer, opt.distributed)

            if i % opt.checkpoint == 0 and opt.is_master_node:
                save_file_path = opt.result_path / 'save_{}.pth'.format(i)
                save_checkpoint(save_file_path, i, opt.arch, model, optimizer,
                                scheduler)

        if not opt.no_val:
            prev_val_loss = val_epoch(i, val_loader, model, criterion,
                                      opt.device, val_logger, tb_writer,
                                      opt.distributed)

        if not opt.no_train and opt.lr_scheduler == 'multistep':
            scheduler.step()
        elif not opt.no_train and opt.lr_scheduler == 'plateau':
            scheduler.step(prev_val_loss)

    if opt.inference:
        inference_loader, inference_class_names = get_inference_utils(opt)
        inference_result_path = opt.result_path / '{}.json'.format(
            opt.inference_subset)

        inference.inference(inference_loader, model, inference_result_path,
                            inference_class_names, opt.inference_no_average,
                            opt.output_topk)
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, valid_mask=None, start_positions=None, end_positions=None): outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds) sequence_output = outputs[0] sequence_output, attention_mask = valid_sequence_output( sequence_output, valid_mask, attention_mask) sequence_output = self.dropout(sequence_output) start_logits = self.start_fc(sequence_output) if start_positions is not None and self.training: if self.soft_label: batch_size = input_ids.size(0) seq_len = input_ids.size(1) label_logits = torch.FloatTensor(batch_size, seq_len, self.num_labels) label_logits.zero_() label_logits = label_logits.to(input_ids.device) label_logits.scatter_(2, start_positions.unsqueeze(2), 1) else: label_logits = start_positions.unsqueeze(2).float() else: label_logits = F.softmax(start_logits, -1) if not self.soft_label: label_logits = torch.argmax(label_logits, -1).unsqueeze(2).float() end_logits = self.end_fc(sequence_output, label_logits) outputs = ( start_logits, end_logits, ) + outputs[2:] if start_positions is not None and end_positions is not None: assert self.loss_type in ['lsr', 'focal', 'ce'] if self.loss_type == 'lsr': loss_fct = LabelSmoothingCrossEntropy() elif self.loss_type == 'focal': loss_fct = FocalLoss() else: loss_fct = CrossEntropyLoss() start_logits = start_logits.view(-1, self.num_labels) end_logits = end_logits.view(-1, self.num_labels) active_loss = attention_mask.view(-1) == 1 active_start_logits = start_logits[active_loss] active_end_logits = end_logits[active_loss] active_start_labels = start_positions.view(-1)[active_loss] active_end_labels = end_positions.view(-1)[active_loss] start_loss = loss_fct(active_start_logits, active_start_labels) end_loss = loss_fct(active_end_logits, active_end_labels) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss, ) + outputs return outputs
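# Standalone sketch of the soft-label construction above: per-token start labels are
# expanded into one-hot vectors with scatter_ before conditioning the end classifier.
# Shapes are illustrative.
import torch

batch, seq_len, num_labels = 2, 6, 5
start_positions = torch.randint(0, num_labels, (batch, seq_len))  # gold start label per token
label_logits = torch.zeros(batch, seq_len, num_labels)
label_logits.scatter_(2, start_positions.unsqueeze(2), 1)         # one-hot along the label dimension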
def main_worker(index, opt): random.seed(opt.manual_seed) np.random.seed(opt.manual_seed) torch.manual_seed(opt.manual_seed) if index >= 0 and opt.device.type == 'cuda': opt.device = torch.device(f'cuda:{index}') if opt.distributed: opt.dist_rank = opt.dist_rank * opt.ngpus_per_node + index dist.init_process_group(backend='nccl', init_method=opt.dist_url, world_size=opt.world_size, rank=opt.dist_rank) opt.batch_size = int(opt.batch_size / opt.ngpus_per_node) opt.n_threads = int( (opt.n_threads + opt.ngpus_per_node - 1) / opt.ngpus_per_node) opt.is_master_node = not opt.distributed or opt.dist_rank == 0 if opt.inference: model = generate_model(opt) else: model = generate_model(opt, use_features=True) if opt.batchnorm_sync: assert opt.distributed, 'SyncBatchNorm only supports DistributedDataParallel.' model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) if opt.pretrain_path: model = load_pretrained_model(model, opt.pretrain_path, opt.n_finetune_classes) if opt.resume_path is not None: model = resume_model(opt.resume_path, opt.arch, model) model = make_data_parallel(model, opt.distributed, opt.device) if opt.pretrain_path: parameters = get_fine_tuning_parameters(model, opt.ft_begin_module) else: parameters = model.parameters() if opt.is_master_node: print(model) ##################################################################################### ### here add a classifier to predict videos and audios if opt.inference is False: ### define loss criterion = CrossEntropyLoss().to(opt.device) if opt.use_audio or opt.use_image: criterion_jsd = JSDLoss(weight=0.5) ################################################################################# if opt.use_audio: ### define loss criterion_ct_av = NCELoss(temperature=0.5) ### audio teacher model feature_dim = 512 * 2 if opt.pretrain_path is not None: joint_prediction_aud = generate_prediction( feature_dim, opt.n_finetune_classes, normalization=True) else: joint_prediction_aud = generate_prediction(feature_dim, opt.n_classes, normalization=True) if opt.resume_path is not None: aux_checkpoint = Path( os.path.join(str(opt.resume_path.parent), str(opt.resume_path.name[:-4] + '_audio.pth'))) joint_prediction_aud = resume_model(aux_checkpoint, opt.arch, joint_prediction_aud) joint_prediction_aud = make_data_parallel(joint_prediction_aud, opt.distributed, opt.device) aud_para = joint_prediction_aud.parameters() joint_prediction_aud.cuda() else: aud_para = None ################################################################################# if opt.use_image: ### define loss criterion_ct_iv = NCELoss(temperature=0.1) ### image teacher model image_model = torchvision.models.resnet34(pretrained=True) # remove the fc layers (only use the image features) image_model = torch.nn.Sequential( *list(image_model.children())[:-1]) image_model = make_data_parallel(image_model, opt.distributed, opt.device) feature_dim = 512 * 2 if opt.pretrain_path is not None: joint_prediction_img = generate_prediction( feature_dim, opt.n_finetune_classes, normalization=True) else: joint_prediction_img = generate_prediction(feature_dim, opt.n_classes, normalization=True) if opt.resume_path is not None: aux_checkpoint = Path( os.path.join(str(opt.resume_path.parent), str(opt.resume_path.name[:-4] + '_image.pth'))) joint_prediction_img = resume_model(aux_checkpoint, opt.arch, joint_prediction_img) joint_prediction_img = make_data_parallel(joint_prediction_img, opt.distributed, opt.device) img_para = joint_prediction_img.parameters() joint_prediction_img.cuda() else: img_para = None 
################################################################################# (train_loader, train_sampler, train_logger, train_batch_logger, optimizer, optimizer_av, optimizer_iv, scheduler) = \ get_train_utils(opt, model_parameters=parameters, av_parameters=aud_para, iv_parameters=img_para) if opt.resume_path is not None: opt.begin_epoch, optimizer, scheduler = resume_train_utils( opt.resume_path, opt.begin_epoch, optimizer, scheduler) if opt.overwrite_milestones: scheduler.milestones = opt.multistep_milestones if not opt.no_val: val_loader, val_logger = get_val_utils(opt) if opt.tensorboard and opt.is_master_node: from torch.utils.tensorboard import SummaryWriter if opt.begin_epoch == 1: tb_writer = SummaryWriter(log_dir=opt.result_path) else: tb_writer = SummaryWriter(log_dir=opt.result_path, purge_step=opt.begin_epoch) else: tb_writer = None prev_val_loss = None pre_val_acc = 0.0 if opt.image_size > opt.sample_size: image_size = opt.image_size else: image_size = None for i in range(opt.begin_epoch, opt.n_epochs + 1): if not opt.no_train: if opt.distributed: train_sampler.set_epoch(i) current_lr = get_lr(optimizer) if optimizer_av is None and optimizer_iv is None: train_epoch(epoch=i, data_loader=train_loader, model=model, criterion=criterion, optimizer=optimizer, device=opt.device, current_lr=current_lr, epoch_logger=train_logger, batch_logger=train_batch_logger, tb_writer=tb_writer, distributed=opt.distributed) elif optimizer_av is not None and optimizer_iv is None: train_a_epoch(epoch=i, data_loader=train_loader, model=model, joint_prediction_aud=joint_prediction_aud, criterion=criterion, criterion_jsd=criterion_jsd, criterion_ct_av=criterion_ct_av, optimizer=optimizer, optimizer_av=optimizer_av, device=opt.device, current_lr=current_lr, epoch_logger=train_logger, batch_logger=train_batch_logger, tb_writer=tb_writer, distributed=opt.distributed) elif optimizer_av is None and optimizer_iv is not None: train_i_epoch(epoch=i, data_loader=train_loader, model=model, image_model=image_model, joint_prediction_img=joint_prediction_img, criterion=criterion, criterion_jsd=criterion_jsd, criterion_ct_iv=criterion_ct_iv, optimizer=optimizer, optimizer_iv=optimizer_iv, device=opt.device, current_lr=current_lr, epoch_logger=train_logger, batch_logger=train_batch_logger, tb_writer=tb_writer, distributed=opt.distributed, image_size=image_size) else: train_ai_epoch(epoch=i, data_loader=train_loader, model=model, image_model=image_model, joint_prediction_aud=joint_prediction_aud, joint_prediction_img=joint_prediction_img, criterion=criterion, criterion_jsd=criterion_jsd, criterion_ct_av=criterion_ct_av, criterion_ct_iv=criterion_ct_iv, optimizer=optimizer, optimizer_av=optimizer_av, optimizer_iv=optimizer_iv, device=opt.device, current_lr=current_lr, epoch_logger=train_logger, batch_logger=train_batch_logger, tb_writer=tb_writer, distributed=opt.distributed, image_size=image_size, loss_weight=opt.loss_weight) if i % opt.checkpoint == 0 and opt.is_master_node: save_file_path = opt.result_path / 'save_{}.pth'.format(i) save_checkpoint(save_file_path, i, opt.arch, model, optimizer, scheduler) if opt.use_audio: save_file_path = opt.result_path / 'save_{}_audio.pth'.format( i) save_checkpoint(save_file_path, i, opt.arch, joint_prediction_aud, optimizer, scheduler) if opt.use_image: save_file_path = opt.result_path / 'save_{}_image.pth'.format( i) save_checkpoint(save_file_path, i, opt.arch, joint_prediction_img, optimizer, scheduler) if not opt.no_val and i % opt.val_freq == 0: prev_val_loss, val_acc 
= val_epoch(i, val_loader, model, criterion, opt.device, val_logger, tb_writer, opt.distributed) if pre_val_acc < val_acc: pre_val_acc = val_acc save_file_path = opt.result_path / 'save_model.pth' save_checkpoint(save_file_path, i, opt.arch, model, optimizer, scheduler) if not opt.no_train and opt.lr_scheduler == 'multistep': scheduler.step() elif not opt.no_train and opt.lr_scheduler == 'plateau': if prev_val_loss is not None: scheduler.step(prev_val_loss) if opt.inference: inference_loader, inference_class_names = get_inference_utils(opt) inference_result_path = opt.result_path / '{}.json'.format( opt.inference_subset) inference.inference(inference_loader, model, inference_result_path, inference_class_names, opt.inference_no_average, opt.output_topk)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument('--kshot', type=int, default=5, help="random seed for initialization") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() processors = {"rte": RteProcessor} output_modes = {"rte": "classification"} if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] train_examples = processor.get_GAP_coreference( 'gap-development.tsv', args.kshot) #train_pu_half_v1.txt dev_examples = processor.get_GAP_coreference('gap-validation.tsv', 0) test_examples = processor.get_GAP_coreference('gap-test.tsv', 0) label_list = ["entailment", "not_entailment"] entity_label_list = ["A-coref", "B-coref"] # train_examples = get_data_hulu_fewshot('train', 5) # train_examples, dev_examples, test_examples, label_list = load_CLINC150_with_specific_domain_sequence(args.DomainName, args.kshot, augment=False) num_labels = len(label_list) print('num_labels:', num_labels, 'training size:', len(train_examples), 'dev size:', len(dev_examples), 'test size:', len(test_examples)) num_train_optimization_steps = None num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) model = RobertaForSequenceClassification(num_labels) tokenizer = RobertaTokenizer.from_pretrained( pretrain_model_dir, do_lower_case=args.do_lower_case) model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) global_step = 0 nb_tr_steps = 0 tr_loss = 0 max_test_acc = 0.0 max_dev_acc = 0.0 max_dev_threshold = 0.0 if args.do_train: train_dataloader = examples_to_features(train_examples, label_list, entity_label_list, args, tokenizer, args.train_batch_size, "classification", dataloader_mode='random') dev_dataloader = examples_to_features(dev_examples, label_list, entity_label_list, args, tokenizer, args.eval_batch_size, "classification", dataloader_mode='sequential') test_dataloader = examples_to_features(test_examples, label_list, entity_label_list, args, tokenizer, args.eval_batch_size, "classification", dataloader_mode='sequential') logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) iter_co = 0 final_test_performance = 0.0 for _ in trange(int(args.num_train_epochs), desc="Epoch"): nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): model.train() batch = tuple(t.to(device) for t in batch) input_example_ids, input_ids, input_mask, segment_ids, label_ids, entity_label_ids = batch logits = model(input_ids, input_mask) loss_fct = 
CrossEntropyLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 optimizer.step() optimizer.zero_grad() global_step += 1 iter_co += 1 # if iter_co %100==0: # print('iter_co:', iter_co, ' mean loss:', tr_loss/iter_co) if iter_co % len(train_dataloader) == 0: model.eval() ''' dev set after this epoch ''' logger.info("***** Running dev *****") logger.info(" Num examples = %d", len(dev_examples)) eval_loss = 0 nb_eval_steps = 0 preds = [] gold_label_ids = [] example_id_list = [] for _, batch in enumerate(tqdm(dev_dataloader, desc="dev")): input_indices, input_ids, input_mask, segment_ids, _, label_ids = batch input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) example_ids = list(input_indices.numpy()) example_id_list += example_ids gold_label_ids += list( label_ids.detach().cpu().numpy()) with torch.no_grad(): logits = model(input_ids, input_mask) if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) preds = preds[0] pred_probs = softmax(preds, axis=1) pred_label_ids_3way = list(np.argmax(pred_probs, axis=1)) pred_prob_entail = list(pred_probs[:, 0]) assert len(example_id_list) == len(pred_prob_entail) assert len(example_id_list) == len(gold_label_ids) assert len(example_id_list) == len(pred_label_ids_3way) best_current_dev_acc = 0.0 best_current_threshold = -10.0 for threshold in np.arange(0.99, 0.0, -0.01): eval_output_list = build_GAP_output_format( example_id_list, gold_label_ids, pred_prob_entail, pred_label_ids_3way, threshold, dev_or_test='validation') dev_acc = run_scorer( '/export/home/Dataset/gap_coreference/gap-validation.tsv', eval_output_list) if dev_acc > best_current_dev_acc: best_current_dev_acc = dev_acc best_current_threshold = threshold print('best_current_dev_threshold:', best_current_threshold, 'best_current_dev_acc:', best_current_dev_acc) if best_current_dev_acc > max_dev_acc: max_dev_acc = best_current_dev_acc max_dev_threshold = best_current_threshold '''eval on test set''' logger.info("***** Running test *****") logger.info(" Num examples = %d", len(test_examples)) eval_loss = 0 nb_eval_steps = 0 preds = [] gold_label_ids = [] example_id_list = [] for _, batch in enumerate( tqdm(test_dataloader, desc="test")): input_indices, input_ids, input_mask, segment_ids, _, label_ids = batch input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) example_ids = list(input_indices.numpy()) example_id_list += example_ids gold_label_ids += list( label_ids.detach().cpu().numpy()) with torch.no_grad(): logits = model(input_ids, input_mask) if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) preds = preds[0] pred_probs = softmax(preds, axis=1) pred_label_ids_3way = list( np.argmax(pred_probs, axis=1)) pred_prob_entail = list(pred_probs[:, 0]) assert len(example_id_list) == len(pred_prob_entail) assert len(example_id_list) == len(gold_label_ids) assert len(example_id_list) == len(pred_label_ids_3way) threshold = max_dev_threshold eval_output_list = build_GAP_output_format( example_id_list, 
gold_label_ids, pred_prob_entail, pred_label_ids_3way, threshold, dev_or_test='test') test_acc = run_scorer( '/export/home/Dataset/gap_coreference/gap-test.tsv', eval_output_list) if test_acc > max_test_acc: max_test_acc = test_acc print('current_test_acc:', test_acc, ' max_test_acc:', max_test_acc) final_test_performance = test_acc print('final_test_performance:', final_test_performance)
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, enc_hiddens=None, encoder_attention_mask=None, caches=None, labels=None, y_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.rembert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, enc_hiddens=enc_hiddens, encoder_attention_mask=encoder_attention_mask, caches=caches, y_cache=y_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) lm_loss = None if labels is not None: # we are doing next-token prediction; shift prediction scores and input ids by one shifted_prediction_scores = prediction_scores[:, : -1, :].contiguous() labels = labels[:, 1:].contiguous() loss_fct = CrossEntropyLoss() lm_loss = loss_fct( shifted_prediction_scores.view(-1, self.config.s_vocab), labels.view(-1)) if not return_dict: output = (prediction_scores, ) + outputs[2:] return ((lm_loss, ) + output) if lm_loss is not None else output return CausalLMOutputWithCrossAttentions( loss=lm_loss, logits=prediction_scores, caches=outputs.caches, hiddens=outputs.hiddens, attns=outputs.attns, crosses=outputs.crosses, )
def forward(self, q_vec, d_vec, sd_vec, labels=None): # embedding input vector q_emb = self.embedding(q_vec) d_emb = self.embedding(d_vec) sd_emb = self.embedding(sd_vec) q_transform = self.q_transformer(q_emb) d_transform = self.d_transformer(d_emb) sd_transform = self.sd_transformer(sd_emb) # Residual Net q_res = torch.cat((q_transform, q_emb), 2) d_res = torch.cat((d_transform, d_emb), 2) sd_res = torch.cat((sd_transform, sd_emb), 2) # alignment q_d_similarity = self.similarity(d_res, q_res) d2q = self.context_to_query(q_d_similarity, d_res) q2d = self.query_to_context(q_d_similarity, q_res) q_d_final = self.final_attention(d_res, d2q, q2d) q_sd_similarity = self.similarity(sd_res, q_res) sd2q = self.context_to_query(q_sd_similarity, sd_res) q2sd = self.query_to_context(q_sd_similarity, q_res) q_sd_final = self.final_attention(sd_res, sd2q, q2sd) q_d_concat = torch.cat((q_res, q_d_final, d_res), 2) # [batch_size, 128, embedding_size*12] q_sd_concat = torch.cat((q_res, q_sd_final, sd_res), 2) # [batch_size, 128, embedding_size*12] q_d_linear = self.q_d_Linear( q_d_concat) # [batch_size, 128, embedding_size*12] q_sd_linear = self.q_sd_Linear( q_sd_concat) # [batch_size, 128, embedding_size*12] all_concat_input = torch.cat((q_d_linear, q_sd_linear), 2) # [batch_size, 128, embedding_size*24] all_concat_input = all_concat_input.to(self.device) result_output = self.result_Linear( all_concat_input) # [batch_size, 128, embedding_size*24] result_output = result_output.to(self.device) result_output = result_output.permute(0, 2, 1) avg_pool = F.adaptive_avg_pool1d(result_output, 1) max_pool = F.adaptive_max_pool1d(result_output, 1) avg_pool = avg_pool.view(q_vec.size(0), -1) max_pool = max_pool.view(q_vec.size(0), -1) result = torch.cat((avg_pool, max_pool), 1) # [batch_size, embedding_size*48] logits = self.classifier(result) if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(logits, labels) return loss, logits else: return logits
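# Standalone sketch of the pooling at the end of the forward pass above: features of
# shape (batch, channels, length) are reduced with both adaptive average and adaptive
# max pooling and concatenated before the classifier. Sizes are illustrative.
import torch
import torch.nn.functional as F

batch, channels, length = 2, 16, 128
features = torch.randn(batch, channels, length)
avg_pool = F.adaptive_avg_pool1d(features, 1).view(batch, -1)  # (batch, channels)
max_pool = F.adaptive_max_pool1d(features, 1).view(batch, -1)  # (batch, channels)
result = torch.cat((avg_pool, max_pool), dim=1)                # (batch, 2 * channels)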
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[ 1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view( -1, input_ids.size(-1)) if input_ids is not None else None attention_mask = (attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None) token_type_ids = (token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None) position_ids = (position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None) inputs_embeds = (inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) if inputs_embeds is not None else None) outputs = self.rembert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) pooled_output = outputs[1] pooled_output = self.drop(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) loss = None if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) if not return_dict: output = (reshaped_logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else output return qo.WithLoss( loss=loss, logits=reshaped_logits, hiddens=outputs.hiddens, attns=outputs.attns, )
def forward(self, input_ids=None, input_ids_org=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, func=None, tail_idxs=None, in_domain_rep=None, out_domain_rep=None, sentence_label=None, lm_label=None, batch_size=None, all_in_task_rep_comb=None, all_sentence_binary_label=None, from_query=False, **kwargs): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): Used to hide legacy arguments that have been deprecated. """ if "masked_lm_labels" in kwargs: warnings.warn( "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", FutureWarning, ) labels = kwargs.pop("masked_lm_labels") assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." return_dict = return_dict if return_dict is not None else self.config.use_return_dict if func == "in_domain_task_rep": ####### outputs = self.roberta( input_ids=input_ids_org, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) ####### #x = features[:, 0, :] # take <s> token (equiv. to [CLS]) #rep = outputs.last_hidden_state[:, 0, :] #rep = outputs.last_hidden_state[:, 0, :] rep_head = outputs.last_hidden_state[:, 0, :] rep_tail = outputs.last_hidden_state[input_ids_org == 2] #detach #rep = rep.detach() ''' in_domain_rep = self.domain_layer(rep) in_task_rep = self.task_layer(rep) return in_domain_rep, in_task_rep ''' return rep_tail, rep_head elif func == "in_domain_task_rep_mean": ####### outputs = self.roberta( input_ids=input_ids_org, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) ####### #x = features[:, 0, :] # take <s> token (equiv. 
to [CLS]) rep = outputs.last_hidden_state mask = rep != 0 rep = (rep * mask).sum(dim=1) / mask.sum(dim=1) #detach #rep = rep.detach() ''' in_domain_rep = self.domain_layer(rep) in_task_rep = self.task_layer(rep) return in_domain_rep, in_task_rep ''' return rep, rep elif func == "return_task_binary_classifier": return self.task_binary_classifier.weight.data, self.task_binary_classifier.bias.data elif func == "return_domain_binary_classifier": return self.domain_binary_classifier.weight.data, self.domain_binary_classifier.bias.data #if func == "task_binary_classifier": elif func == "domain_binary_classifier": #in:1 , out:0 #Need to fix ####### outputs = self.roberta( input_ids=input_ids_org, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) ####### #Didn't include query rep: so it need to add in_domain_rep here loss_fct = CrossEntropyLoss() out_domain_rep_head = outputs.last_hidden_state[:, 0, :] out_domain_rep_tail = outputs.last_hidden_state[input_ids_org == 2] #print("model_head",out_domain_rep_head.shape) #print("model_tail",out_domain_rep_tail.shape) domain_rep = torch.cat([in_domain_rep, out_domain_rep_tail], 0) #detach #domain_rep = domain_rep.detach() logit = self.domain_binary_classifier(domain_rep) logit = self.LeakyReLU(logit) pos_target = torch.tensor([1] * in_domain_rep.shape[0]).to("cuda") neg_target = torch.tensor([0] * out_domain_rep_tail.shape[0]).to("cuda") target = torch.cat([pos_target, neg_target], 0) domain_loss = loss_fct(logit, target) return domain_loss, logit, out_domain_rep_head, out_domain_rep_tail elif func == "domain_binary_classifier_mean": #in:1 , out:0 #Need to fix ####### outputs = self.roberta( input_ids=input_ids_org, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) ####### #Didn't include query rep: so it need to add in_domain_rep here loss_fct = CrossEntropyLoss() out_domain_rep = outputs.last_hidden_state ### mask = out_domain_rep != 0 out_domain_rep = (out_domain_rep * mask).sum(dim=1) / mask.sum(dim=1) ### domain_rep = torch.cat([in_domain_rep, out_domain_rep], 0) #detach #domain_rep = domain_rep.detach() logit = self.domain_binary_classifier(domain_rep) logit = self.LeakyReLU(logit) pos_target = torch.tensor([1] * in_domain_rep.shape[0]).to("cuda") neg_target = torch.tensor([0] * out_domain_rep.shape[0]).to("cuda") target = torch.cat([pos_target, neg_target], 0) domain_loss = loss_fct(logit, target) return domain_loss, logit elif func == "task_binary_classifier": #Didn't include query rep: so it need to add in_domain_rep here loss_fct = CrossEntropyLoss() #detach #all_in_task_rep_comb = all_in_task_rep_comb.detach() logit = self.task_binary_classifier(all_in_task_rep_comb) logit = self.LeakyReLU(logit) all_sentence_binary_label = all_sentence_binary_label.reshape( all_sentence_binary_label.shape[0] * all_sentence_binary_label.shape[1]) logit = logit.reshape(logit.shape[0] * logit.shape[1], logit.shape[2]) task_binary_loss = loss_fct(logit.view(-1, 2), all_sentence_binary_label.view(-1)) return task_binary_loss, logit elif func == "task_binary_classifier_mean": #Didn't include query rep: so it need to add in_domain_rep here loss_fct = CrossEntropyLoss() #detach 
#all_in_task_rep_comb = all_in_task_rep_comb.detach() logit = self.task_binary_classifier(all_in_task_rep_comb) logit = self.LeakyReLU(logit) all_sentence_binary_label = all_sentence_binary_label.reshape( all_sentence_binary_label.shape[0] * all_sentence_binary_label.shape[1]) logit = logit.reshape(logit.shape[0] * logit.shape[1], logit.shape[2]) task_binary_loss = loss_fct(logit.view(-1, 2), all_sentence_binary_label.view(-1)) return task_binary_loss, logit elif func == "task_class": ####### outputs = self.roberta( input_ids=input_ids_org, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) ####### #Already including query rep loss_fct = CrossEntropyLoss() ### #class_logit = self.classifier(outputs.last_hidden_state, input_ids_org) class_logit = self.classifier(outputs.last_hidden_state) task_loss = loss_fct(class_logit.view(-1, self.num_labels), sentence_label.view(-1)) if from_query == True: query_rep_head = outputs.last_hidden_state[:, 0, :] query_rep_tail = outputs.last_hidden_state[input_ids_org == 2] return task_loss, class_logit, query_rep_head, query_rep_tail else: return task_loss, class_logit elif func == "mlm": outputs_mlm = self.roberta( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) loss_fct = CrossEntropyLoss() #sequence_output = outputs_mlm.last_hidden_state sequence_output = outputs_mlm[0] prediction_scores = self.lm_head(sequence_output) loss_fct = CrossEntropyLoss(ignore_index=-1) masked_lm_loss = loss_fct( prediction_scores.view(-1, self.config.vocab_size), lm_label.view(-1)) return masked_lm_loss elif func == "task_class and mlm": ####### outputs = self.roberta( input_ids=input_ids_org, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) ####### ####### outputs_mlm = self.roberta( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) ####### #Already including query rep #task loss loss_fct = CrossEntropyLoss() ### ''' #rep = outputs.last_hidden_state[input_ids==2] rep = outputs.last_hidden_state[:, 0, :] #rep = rep.detach() task_rep = self.task_layer(rep) class_logit = self.layer_out_taskClass((self.act(task_rep))) ''' class_logit = self.classifier(outputs.last_hidden_state) ### task_loss = loss_fct(class_logit.view(-1, 8), sentence_label.view(-1)) #mlm loss sequence_output = outputs_mlm.last_hidden_state prediction_scores = self.lm_head(sequence_output) loss_fct = CrossEntropyLoss(ignore_index=-1) masked_lm_loss = loss_fct( prediction_scores.view(-1, self.config.vocab_size), lm_label.view(-1)) return task_loss, masked_lm_loss elif func == "gen_rep": outputs = self.roberta( input_ids=input_ids_org, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, 
output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) return outputs '''
def forward( self, input_ids=None, attention_mask=None, encoder_hidden_states=None, encoder_attention_mask=None, head_mask=None, cross_attn_head_mask=None, past_key_values=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. Indices can be obtained using [`Speech2Text2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is configured as a decoder. encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional tensors are only required when the model is used as a decoder in a Sequence to Sequence model. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. Returns: Example: ```python >>> from transformers import ( ... SpeechEncoderDecoderModel, ... Speech2Text2ForCausalLM, ... Wav2Vec2Model, ... Speech2Text2Config, ... Wav2Vec2Config, ... Wav2Vec2FeatureExtractor, ... Speech2Text2Tokenizer, ... ) >>> from datasets import load_dataset >>> feature_extractor = Wav2Vec2FeatureExtractor() >>> tokenizer = Speech2Text2Tokenizer.from_pretrained("facebook/s2t-wav2vec2-large-en-de") >>> encoder = Wav2Vec2Model(Wav2Vec2Config()) >>> decoder = Speech2Text2ForCausalLM(Speech2Text2Config()) >>> # init random speech2text model >>> model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder) >>> model.config.pad_token_id = tokenizer.pad_token_id >>> model.config.decoder_start_token_id = tokenizer.bos_token_id >>> # pre-process inputs and labels >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") >>> inputs = feature_extractor( ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt" ... ) >>> input_values = inputs.input_values >>> decoder_input_ids = tokenizer(ds[0]["text"], return_tensors="pt").input_ids >>> # compute loss >>> loss = model(inputs=input_values, labels=decoder_input_ids).loss >>> # backprop loss >>> loss.backward() # doctest: +IGNORE_RESULT ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = (output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states) return_dict = return_dict if return_dict is not None else self.config.use_return_dict # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) outputs = self.model.decoder( input_ids=input_ids, attention_mask=attention_mask, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, head_mask=head_mask, cross_attn_head_mask=cross_attn_head_mask, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) logits = self.lm_head(outputs[0]) loss = None if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) if not return_dict: output = (logits, ) + outputs[1:] return (loss, ) + output if loss is not None else output return CausalLMOutputWithCrossAttentions( loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, cross_attentions=outputs.cross_attentions, )
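A note on the loss computed above: the `(batch, seq, vocab)` logits and `(batch, seq)` labels are flattened before CrossEntropyLoss, and any position labelled -100 is skipped because that is the loss's default `ignore_index`. A minimal sketch of just that behaviour, with dummy shapes (not the model's real configuration):

```python
import torch
from torch.nn import CrossEntropyLoss

batch_size, seq_len, vocab_size = 2, 5, 11            # dummy shapes
logits = torch.randn(batch_size, seq_len, vocab_size)
labels = torch.randint(0, vocab_size, (batch_size, seq_len))
labels[0, :2] = -100                                   # these positions are ignored

loss_fct = CrossEntropyLoss()                          # ignore_index defaults to -100
loss = loss_fct(logits.view(-1, vocab_size), labels.view(-1))
print(loss.item())
```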
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) ##### #return outputs ##### sequence_output = outputs[0] logits = self.classifier(sequence_output) loss = None if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
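The branch above reuses the same classification head for regression when `config.num_labels == 1`. A hedged sketch of just that switch, using dummy logits and labels:

```python
import torch
from torch.nn import CrossEntropyLoss, MSELoss

def classification_or_regression_loss(logits, labels, num_labels):
    if num_labels == 1:
        # regression: mean-squared error on a single output unit
        return MSELoss()(logits.view(-1), labels.view(-1).float())
    # classification: cross-entropy over num_labels class logits
    return CrossEntropyLoss()(logits.view(-1, num_labels), labels.view(-1))

cls_loss = classification_or_regression_loss(torch.randn(4, 3), torch.tensor([0, 2, 1, 1]), 3)
reg_loss = classification_or_regression_loss(torch.randn(4, 1), torch.randn(4), 1)
```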
def Train(model, t, loader, eps_scheduler, max_eps, norm, logger, verbose, train, opt, method, **kwargs): # if train=True, use training mode # if train=False, use test mode, no back prop num_class = 10 losses = AverageMeter() l1_losses = AverageMeter() errors = AverageMeter() robust_errors = AverageMeter() regular_ce_losses = AverageMeter() robust_ce_losses = AverageMeter() relu_activities = AverageMeter() bound_bias = AverageMeter() bound_diff = AverageMeter() unstable_neurons = AverageMeter() dead_neurons = AverageMeter() alive_neurons = AverageMeter() batch_time = AverageMeter() batch_multiplier = kwargs.get("batch_multiplier", 1) kappa = 1 beta = 1 if train: model.train() else: model.eval() # pregenerate the array for specifications, will be used for scatter sa = np.zeros((num_class, num_class - 1), dtype=np.int32) for i in range(sa.shape[0]): for j in range(sa.shape[1]): if j < i: sa[i][j] = j else: sa[i][j] = j + 1 sa = torch.LongTensor(sa) batch_size = loader.batch_size * batch_multiplier if batch_multiplier > 1 and train: logger.log( 'Warning: Large batch training. The equivalent batch size is {} * {} = {}.' .format(batch_multiplier, loader.batch_size, batch_size)) # per-channel std and mean std = torch.tensor(loader.std).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) mean = torch.tensor(loader.mean).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) model_range = 0.0 end_eps = eps_scheduler.get_eps(t + 1, 0) if end_eps < np.finfo(np.float32).tiny: logger.log('eps {} close to 0, using natural training'.format(end_eps)) method = "natural" lb_batches = [] for i, (data, labels) in enumerate(loader): start = time.time() eps = eps_scheduler.get_eps(t, int(i // batch_multiplier)) if train and i % batch_multiplier == 0: opt.zero_grad() # generate specifications c = torch.eye(num_class).type_as(data)[labels].unsqueeze( 1) - torch.eye(num_class).type_as(data).unsqueeze(0) # remove specifications to self I = (~(labels.data.unsqueeze(1) == torch.arange(num_class).type_as( labels.data).unsqueeze(0))) c = (c[I].view(data.size(0), num_class - 1, num_class)) # scatter matrix to avoid compute margin to self sa_labels = sa[labels] # storing computed lower bounds after scatter lb_s = torch.zeros(data.size(0), num_class) ub_s = torch.zeros(data.size(0), num_class) # FIXME: Assume unnormalized data is from range 0 - 1 if kwargs["bounded_input"]: if norm != np.inf: raise ValueError( "bounded input only makes sense for Linf perturbation. " "Please set the bounded_input option to false.") data_max = torch.reshape((1. - mean) / std, (1, -1, 1, 1)) data_min = torch.reshape((0. - mean) / std, (1, -1, 1, 1)) data_ub = torch.min(data + (eps / std), data_max) data_lb = torch.max(data - (eps / std), data_min) else: if norm == np.inf: data_ub = data + (eps / std) data_lb = data - (eps / std) else: # For other norms, eps will be used instead. 
data_ub = data_lb = data if list(model.parameters())[0].is_cuda: data = data.cuda() data_ub = data_ub.cuda() data_lb = data_lb.cuda() labels = labels.cuda() c = c.cuda() sa_labels = sa_labels.cuda() lb_s = lb_s.cuda() ub_s = ub_s.cuda() # convert epsilon to a tensor eps_tensor = data.new(1) eps_tensor[0] = eps # omit the regular cross entropy, since we use robust error output = model(data, method_opt="forward", disable_multi_gpu=(method == "natural")) regular_ce = CrossEntropyLoss()(output, labels) regular_ce_losses.update(regular_ce.cpu().detach().numpy(), data.size(0)) errors.update( torch.sum( torch.argmax(output, dim=1) != labels).cpu().detach().numpy() / data.size(0), data.size(0)) # get range statistic model_range = output.max().detach().cpu().item() - output.min().detach( ).cpu().item() ''' torch.set_printoptions(threshold=5000) print('prediction: ', output) ub, lb, _, _, _, _ = model(norm=norm, x_U=data_ub, x_L=data_lb, eps=eps, C=c, method_opt="interval_range") lb = lb_s.scatter(1, sa_labels, lb) ub = ub_s.scatter(1, sa_labels, ub) print('interval ub: ', ub) print('interval lb: ', lb) ub, _, lb, _ = model(norm=norm, x_U=data_ub, x_L=data_lb, eps=eps, C=c, upper=True, lower=True, method_opt="backward_range") lb = lb_s.scatter(1, sa_labels, lb) ub = ub_s.scatter(1, sa_labels, ub) print('crown-ibp ub: ', ub) print('crown-ibp lb: ', lb) ub, _, lb, _ = model(norm=norm, x_U=data_ub, x_L=data_lb, eps=eps, C=c, upper=True, lower=True, method_opt="full_backward_range") lb = lb_s.scatter(1, sa_labels, lb) ub = ub_s.scatter(1, sa_labels, ub) print('full-crown ub: ', ub) print('full-crown lb: ', lb) input() ''' if verbose or method != "natural": if kwargs["bound_type"] == "convex-adv": # Wong and Kolter's bound, or equivalently Fast-Lin if kwargs["convex-proj"] is not None: proj = kwargs["convex-proj"] if norm == np.inf: norm_type = "l1_median" elif norm == 2: norm_type = "l2_normal" else: raise (ValueError( "Unsupported norm {} for convex-adv".format(norm))) else: proj = None if norm == np.inf: norm_type = "l1" elif norm == 2: norm_type = "l2" else: raise (ValueError( "Unsupported norm {} for convex-adv".format(norm))) if loader.std == [1] or loader.std == [1, 1, 1]: convex_eps = eps else: convex_eps = eps / np.mean(loader.std) # for CIFAR we are roughly / 0.2 # FIXME this is due to a bug in convex_adversarial, we cannot use per-channel eps if norm == np.inf: # bounded input is only for Linf if kwargs["bounded_input"]: # FIXME the bounded projection in convex_adversarial has a bug, data range must be positive assert loader.std == [1, 1, 1] or loader.std == [1] data_l = 0.0 data_u = 1.0 else: data_l = -np.inf data_u = np.inf else: data_l = data_u = None f = DualNetwork(model, data, convex_eps, proj=proj, norm_type=norm_type, bounded_input=kwargs["bounded_input"], data_l=data_l, data_u=data_u) lb = f(c) elif kwargs["bound_type"] == "interval": ub, lb, relu_activity, unstable, dead, alive = model( norm=norm, x_U=data_ub, x_L=data_lb, eps=eps, C=c, method_opt="interval_range") elif kwargs["bound_type"] == "crown-full": _, _, lb, _ = model(norm=norm, x_U=data_ub, x_L=data_lb, eps=eps, C=c, upper=False, lower=True, method_opt="full_backward_range") unstable = dead = alive = relu_activity = torch.tensor([0]) elif kwargs["bound_type"] == "crown-interval": # Enable multi-GPU only for the computationally expensive CROWN-IBP bounds, # not for regular forward propagation and IBP because the communication overhead can outweigh benefits, giving little speedup. 
ub, ilb, relu_activity, unstable, dead, alive = model( norm=norm, x_U=data_ub, x_L=data_lb, eps=eps, C=c, method_opt="interval_range") crown_final_beta = kwargs['final-beta'] beta = (max_eps - eps * (1.0 - crown_final_beta)) / max_eps if beta < 1e-5: lb = ilb else: if kwargs["runnerup_only"]: # regenerate a smaller c, with just the runner-up prediction # mask the ground-truth label output, select the second largest class # print(output) # torch.set_printoptions(threshold=5000) masked_output = output.detach().scatter( 1, labels.unsqueeze(-1), -100) # print(masked_output) # location of the runner up prediction runner_up = masked_output.max(1)[1] # print(runner_up) # print(labels) # get margin from the ground-truth to runner-up only runnerup_c = torch.eye(num_class).type_as(data)[labels] # print(runnerup_c) # set the runner-up location to -1 runnerup_c.scatter_(1, runner_up.unsqueeze(-1), -1) runnerup_c = runnerup_c.unsqueeze(1).detach() # print(runnerup_c) # get the bound for runnerup_c _, _, clb, bias = model(norm=norm, x_U=data_ub, x_L=data_lb, eps=eps, C=runnerup_c, method_opt="backward_range") clb = clb.expand(clb.size(0), num_class - 1) else: # get the CROWN bound using interval bounds _, _, clb, bias = model(norm=norm, x_U=data_ub, x_L=data_lb, eps=eps, C=c, method_opt="backward_range") bound_bias.update(bias.sum() / data.size(0)) # how much better is crown-ibp than ibp? diff = (clb - ilb).sum().item() bound_diff.update(diff / data.size(0), data.size(0)) # lb = torch.max(lb, clb) lb = clb * beta + ilb * (1 - beta) else: raise RuntimeError("Unknown bound_type " + kwargs["bound_type"]) lb = lb_s.scatter(1, sa_labels, lb) robust_ce = CrossEntropyLoss()(-lb, labels) if kwargs["bound_type"] != "convex-adv": relu_activities.update( relu_activity.sum().detach().cpu().item() / data.size(0), data.size(0)) unstable_neurons.update( unstable.sum().detach().cpu().item() / data.size(0), data.size(0)) dead_neurons.update( dead.sum().detach().cpu().item() / data.size(0), data.size(0)) alive_neurons.update( alive.sum().detach().cpu().item() / data.size(0), data.size(0)) if method == "robust": loss = robust_ce elif method == "robust_activity": loss = robust_ce + kwargs["activity_reg"] * relu_activity.sum() elif method == "natural": loss = regular_ce elif method == "robust_natural": natural_final_factor = kwargs["final-kappa"] kappa = (max_eps - eps * (1.0 - natural_final_factor)) / max_eps loss = (1 - kappa) * robust_ce + kappa * regular_ce else: raise ValueError("Unknown method " + method) if train and kwargs["l1_reg"] > np.finfo(np.float32).tiny: reg = kwargs["l1_reg"] l1_loss = 0.0 for name, param in model.named_parameters(): if 'bias' not in name: l1_loss = l1_loss + torch.sum(torch.abs(param)) l1_loss = reg * l1_loss loss = loss + l1_loss l1_losses.update(l1_loss.cpu().detach().numpy(), data.size(0)) if train: loss.backward() if i % batch_multiplier == 0 or i == len(loader) - 1: opt.step() losses.update(loss.cpu().detach().numpy(), data.size(0)) if verbose or method != "natural": robust_ce_losses.update(robust_ce.cpu().detach().numpy(), data.size(0)) # robust_ce_losses.update(robust_ce, data.size(0)) robust_errors.update( torch.sum( (lb < 0).any(dim=1)).cpu().detach().numpy() / data.size(0), data.size(0)) batch_time.update(time.time() - start) if i % 50 == 0 and train: logger.log( '[{:2d}:{:4d}]: eps {:4f} ' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f}) ' 'Total Loss {loss.val:.4f} ({loss.avg:.4f}) ' 'L1 Loss {l1_loss.val:.4f} ({l1_loss.avg:.4f}) ' 'CE {regular_ce_loss.val:.4f} 
({regular_ce_loss.avg:.4f}) ' 'RCE {robust_ce_loss.val:.4f} ({robust_ce_loss.avg:.4f}) ' 'Err {errors.val:.4f} ({errors.avg:.4f}) ' 'Rob Err {robust_errors.val:.4f} ({robust_errors.avg:.4f}) ' 'Uns {unstable.val:.1f} ({unstable.avg:.1f}) ' 'Dead {dead.val:.1f} ({dead.avg:.1f}) ' 'Alive {alive.val:.1f} ({alive.avg:.1f}) ' 'Tightness {tight.val:.5f} ({tight.avg:.5f}) ' 'Bias {bias.val:.5f} ({bias.avg:.5f}) ' 'Diff {diff.val:.5f} ({diff.avg:.5f}) ' 'R {model_range:.3f} ' 'beta {beta:.3f} ({beta:.3f}) ' 'kappa {kappa:.3f} ({kappa:.3f}) '.format( t, i, eps, batch_time=batch_time, loss=losses, errors=errors, robust_errors=robust_errors, l1_loss=l1_losses, regular_ce_loss=regular_ce_losses, robust_ce_loss=robust_ce_losses, unstable=unstable_neurons, dead=dead_neurons, alive=alive_neurons, tight=relu_activities, bias=bound_bias, diff=bound_diff, model_range=model_range, beta=beta, kappa=kappa)) if verbose or method != "natural": lb_batches.append(lb) logger.log('[FINAL RESULT epoch:{:2d} eps:{:.4f}]: ' 'Time {batch_time.val:.3f} ({batch_time.avg:.3f}) ' 'Total Loss {loss.val:.4f} ({loss.avg:.4f}) ' 'L1 Loss {l1_loss.val:.4f} ({l1_loss.avg:.4f}) ' 'CE {regular_ce_loss.val:.4f} ({regular_ce_loss.avg:.4f}) ' 'RCE {robust_ce_loss.val:.4f} ({robust_ce_loss.avg:.4f}) ' 'Uns {unstable.val:.3f} ({unstable.avg:.3f}) ' 'Dead {dead.val:.1f} ({dead.avg:.1f}) ' 'Alive {alive.val:.1f} ({alive.avg:.1f}) ' 'Tight {tight.val:.5f} ({tight.avg:.5f}) ' 'Bias {bias.val:.5f} ({bias.avg:.5f}) ' 'Diff {diff.val:.5f} ({diff.avg:.5f}) ' 'Err {errors.val:.4f} ({errors.avg:.4f}) ' 'Rob Err {robust_errors.val:.4f} ({robust_errors.avg:.4f}) ' 'R {model_range:.3f} ' 'beta {beta:.3f} ({beta:.3f}) ' 'kappa {kappa:.3f} ({kappa:.3f}) \n'.format( t, eps, batch_time=batch_time, loss=losses, errors=errors, robust_errors=robust_errors, l1_loss=l1_losses, regular_ce_loss=regular_ce_losses, robust_ce_loss=robust_ce_losses, unstable=unstable_neurons, dead=dead_neurons, alive=alive_neurons, tight=relu_activities, bias=bound_bias, diff=bound_diff, model_range=model_range, kappa=kappa, beta=beta)) for i, l in enumerate( model if isinstance(model, BoundSequential) else model.module): if isinstance(l, BoundLinear) or isinstance(l, BoundConv2d): norm = l.weight.data.detach().view(l.weight.size(0), -1).abs().sum(1).max().cpu() logger.log('layer {} norm {}'.format(i, norm)) if method == "natural": return errors.avg, errors.avg, lb_batches else: return robust_errors.avg, errors.avg, lb_batches
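For reference, the specification matrix `c` built near the top of `Train` encodes, for every example, the margins between the ground-truth class and each other class, with the self-margin row removed; the scattered lower bounds `lb` are then fed to `CrossEntropyLoss` as `-lb`, so minimising the robust loss pushes the margin lower bounds to be positive. A small sketch of the `c` construction with made-up labels:

```python
import torch

num_class = 4
labels = torch.tensor([2, 0])
data = torch.randn(2, 3)  # only used for dtype, mirroring Train()

c = torch.eye(num_class).type_as(data)[labels].unsqueeze(1) \
    - torch.eye(num_class).type_as(data).unsqueeze(0)
I = ~(labels.unsqueeze(1) == torch.arange(num_class).unsqueeze(0))
c = c[I].view(data.size(0), num_class - 1, num_class)
print(c.shape)  # torch.Size([2, 3, 4]); row j of c[i] dotted with the logits gives logit[true] - logit[j]
```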
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) loss = None if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits, ) + outputs[2:] return ((loss, ) + output) if loss is not None else output return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
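The `active_loss` block above avoids scoring padding: positions where `attention_mask == 0` are remapped to the loss's `ignore_index`, so only real tokens contribute. A dummy-tensor sketch of that remapping:

```python
import torch
from torch.nn import CrossEntropyLoss

num_labels = 3
logits = torch.randn(2, 4, num_labels)
labels = torch.randint(0, num_labels, (2, 4))
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

loss_fct = CrossEntropyLoss()
active = attention_mask.view(-1) == 1
active_labels = torch.where(active, labels.view(-1),
                            torch.tensor(loss_fct.ignore_index))
loss = loss_fct(logits.view(-1, num_labels), active_labels)
```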
def forward( self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, mc_token_ids=None, lm_labels=None, mc_labels=None, ): r""" mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input) Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`) Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`) Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs: lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided): Language modeling loss. mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided): Multiple choice classification loss. lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). mc_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import torch from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2DoubleHeadsModel.from_pretrained('gpt2') # Add a [CLS] to the vocabulary (we should train it also!) 
tokenizer.add_special_tokens({'cls_token': '[CLS]'}) model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size print(tokenizer.cls_token_id, len(tokenizer)) # The newly token the last token of the vocabulary choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] encoded_choices = [tokenizer.encode(s) for s in choices] cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2 mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 outputs = model(input_ids, mc_token_ids=mc_token_ids) lm_prediction_scores, mc_prediction_scores = outputs[:2] """ transformer_outputs = self.transformer( input_ids, past=past, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1) outputs = (lm_logits, mc_logits) + transformer_outputs[1:] if mc_labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) outputs = (loss, ) + outputs if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss, ) + outputs return outputs # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions)
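As in the other language-modelling heads in this file, the LM loss above shifts the labels by one position so the logits at step t are scored against token t+1, leaving the final position without a target. A minimal sketch of the shift with dummy tensors:

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size = 13
lm_logits = torch.randn(1, 6, vocab_size)
lm_labels = torch.randint(0, vocab_size, (1, 6))

shift_logits = lm_logits[..., :-1, :].contiguous()   # predictions for positions 0..n-2
shift_labels = lm_labels[..., 1:].contiguous()       # targets are positions 1..n-1
loss = CrossEntropyLoss()(shift_logits.view(-1, vocab_size), shift_labels.view(-1))
```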
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the training files for the CoNLL-2003 NER task.", ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_TYPES), ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written.", ) # Other parameters parser.add_argument( "--labels", default="", type=str, help= "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.", ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.") parser.add_argument("--keep_accents", action="store_const", const=True, help="Set this flag if model is trained with accents.") parser.add_argument( "--strip_accents", action="store_const", const=True, help="Set this flag if model is trained without accents.") parser.add_argument("--use_fast", action="store_const", const=True, help="Set this flag to use fast tokenization.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. 
Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument("--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory") parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets") parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." 
.format(args.output_dir)) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) # Prepare CONLL-2003 task labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config = AutoConfig.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, id2label={str(i): label for i, label in enumerate(labels)}, label2id={label: i for i, label in enumerate(labels)}, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer_args = { k: v for k, v in vars(args).items() if v is not None and k in TOKENIZER_ARGS } logger.info("Tokenizer arguments: %s", tokenizer_args) tokenizer = AutoTokenizer.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir if args.cache_dir else None, **tokenizer_args, ) model = AutoModelForTokenClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train") global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and 
tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` model_to_save = (model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = AutoTokenizer.from_pretrained(args.output_dir, **tokenizer_args) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) logging.getLogger("pytorch_transformers.modeling_utils").setLevel( logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split( "-")[-1] if len(checkpoints) > 1 else "" model = AutoModelForTokenClassification.from_pretrained(checkpoint) model.to(args.device) result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step) if global_step: result = { "{}_{}".format(global_step, k): v for k, v in result.items() } results.update(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) if args.do_predict and args.local_rank in [-1, 0]: tokenizer = AutoTokenizer.from_pretrained(args.output_dir, **tokenizer_args) model = AutoModelForTokenClassification.from_pretrained( args.output_dir) model.to(args.device) result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test") # Save results output_test_results_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_results_file, "w") as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt") with open(output_test_predictions_file, "w") as writer: with open(os.path.join(args.data_dir, "test.txt"), "r") as f: example_id = 0 for line in f: if line.startswith( "-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not predictions[example_id]: example_id += 1 elif predictions[example_id]: output_line = line.split( )[0] + " " + predictions[example_id].pop(0) + "\n" writer.write(output_line) else: logger.warning( "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0]) return results
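One detail of the pipeline above worth calling out: `pad_token_label_id` is set to `CrossEntropyLoss().ignore_index`, so padding and sub-word continuation positions can be labelled with it and silently dropped from the token-classification loss. A tiny sketch of the effect (shapes and label ids are invented):

```python
import torch
from torch.nn import CrossEntropyLoss

pad_token_label_id = CrossEntropyLoss().ignore_index   # -100 by default
logits = torch.randn(1, 5, 4)                          # (batch, seq_len, num_labels)
labels = torch.tensor([[2, pad_token_label_id, 1, pad_token_label_id, 0]])
loss = CrossEntropyLoss()(logits.view(-1, 4), labels.view(-1))
```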
def forward( self, input_ids=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ transformer_outputs = self.transformer( input_ids, past=past, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) outputs = (lm_logits,) + transformer_outputs[1:] if labels is not None: # Shift so that tokens < n predict n shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions)
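When `labels` are passed, the snippet above prepends the mean token-level cross-entropy to the output tuple. A common follow-up, not part of this code, is to report that value as perplexity:

```python
import torch

def perplexity_from_loss(loss: torch.Tensor) -> float:
    # loss is the mean per-token cross-entropy, e.g. outputs[0] above
    return torch.exp(loss).item()

print(perplexity_from_loss(torch.tensor(2.0)))  # ~7.39
```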
def __init__(self, criterion=None, lr: float = 0.001, momentum=0.9, l2=0.0005, train_epochs: int = 4, init_update_rate: float = 0.01, inc_update_rate=0.00005, max_r_max=1.25, max_d_max=0.5, inc_step=4.1e-05, rm_sz: int = 1500, freeze_below_layer: str = "lat_features.19.bn.beta", latent_layer_num: int = 19, ewc_lambda: float = 0, train_mb_size: int = 128, eval_mb_size: int = 128, device=None, plugins: Optional[Sequence[StrategyPlugin]] = None, evaluator: EvaluationPlugin = default_logger, eval_every=-1): """ Creates an instance of the AR1 strategy. :param criterion: The loss criterion to use. Defaults to None, in which case the cross entropy loss is used. :param lr: The learning rate (SGD optimizer). :param momentum: The momentum (SGD optimizer). :param l2: The L2 penalty used for weight decay. :param train_epochs: The number of training epochs. Defaults to 4. :param init_update_rate: The initial update rate of BatchReNorm layers. :param inc_update_rate: The incremental update rate of BatchReNorm layers. :param max_r_max: The maximum r value of BatchReNorm layers. :param max_d_max: The maximum d value of BatchReNorm layers. :param inc_step: The incremental step of r and d values of BatchReNorm layers. :param rm_sz: The size of the replay buffer. The replay buffer is shared across classes. Defaults to 1500. :param freeze_below_layer: A string describing the name of the layer to use while freezing the lower (nearest to the input) part of the model. The given layer is not frozen (exclusive). :param latent_layer_num: The number of the layer to use as the Latent Replay Layer. Usually this is the same of `freeze_below_layer`. :param ewc_lambda: The Synaptic Intelligence lambda term. Defaults to 0, which means that the Synaptic Intelligence regularization will not be applied. :param train_mb_size: The train minibatch size. Defaults to 128. :param eval_mb_size: The eval minibatch size. Defaults to 128. :param device: The device to use. Defaults to None (cpu). :param plugins: (optional) list of StrategyPlugins. :param evaluator: (optional) instance of EvaluationPlugin for logging and metric computations. :param eval_every: the frequency of the calls to `eval` inside the training loop. if -1: no evaluation during training. if 0: calls `eval` after the final epoch of each training experience. if >0: calls `eval` every `eval_every` epochs and at the end of all the epochs for a single experience. """ warnings.warn("The AR1 strategy implementation is in an alpha stage " "and is not perfectly aligned with the paper " "implementation. Please use at your own risk!") if plugins is None: plugins = [] # Model setup model = MobilenetV1(pretrained=True, latent_layer_num=latent_layer_num) replace_bn_with_brn(model, momentum=init_update_rate, r_d_max_inc_step=inc_step, max_r_max=max_r_max, max_d_max=max_d_max) fc_name, fc_layer = get_last_fc_layer(model) if ewc_lambda != 0: # Synaptic Intelligence is not applied to the last fully # connected layer (and implicitly to "freeze below" ones. 
plugins.append( SynapticIntelligencePlugin(ewc_lambda, excluded_parameters=[fc_name])) self.cwr_plugin = CWRStarPlugin(model, cwr_layer_name=fc_name, freeze_remaining_model=False) plugins.append(self.cwr_plugin) optimizer = SGD(model.parameters(), lr=lr, momentum=momentum, weight_decay=l2) if criterion is None: criterion = CrossEntropyLoss() self.ewc_lambda = ewc_lambda self.freeze_below_layer = freeze_below_layer self.rm_sz = rm_sz self.inc_update_rate = inc_update_rate self.max_r_max = max_r_max self.max_d_max = max_d_max self.lr = lr self.momentum = momentum self.l2 = l2 self.rm = None self.cur_acts: Optional[Tensor] = None self.replay_mb_size = 0 super().__init__(model, optimizer, criterion, train_mb_size=train_mb_size, train_epochs=train_epochs, eval_mb_size=eval_mb_size, device=device, plugins=plugins, evaluator=evaluator, eval_every=eval_every)
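The `freeze_below_layer` argument names the first parameter that should stay trainable; everything registered before it is frozen. The actual Avalanche helper is not shown here, but the idea can be sketched in plain PyTorch (the function and the toy model below are hypothetical):

```python
import torch.nn as nn

def freeze_up_to(model: nn.Module, stop_at: str) -> None:
    # freeze parameters in registration order until the named one is reached;
    # the named parameter itself (and everything after it) stays trainable
    for name, param in model.named_parameters():
        if name == stop_at:
            break
        param.requires_grad = False

net = nn.Sequential(nn.Linear(8, 8), nn.ReLU(), nn.Linear(8, 2))
freeze_up_to(net, "2.weight")
print([n for n, p in net.named_parameters() if p.requires_grad])  # ['2.weight', '2.bias']
```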
def forward(self, input_ids, token_type_ids=None, attention_mask=None, lus=None, senses=None, args=None, using_gold_fame=False, position_ids=None, head_mask=None): sequence_output, pooled_output = self.bert( input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) sequence_output = self.dropout(sequence_output) pooled_output = self.dropout(pooled_output) # with torch.no_grad(): # pooled_output = self.mlp_model(pooled_output) # self.mlp_model.train() # self.mlp_model.eval() # pooled_output = self.mlp_model(pooled_output) sense_logits = self.sense_classifier(pooled_output) arg_logits = self.arg_classifier(sequence_output) lufr_masks = utils.get_masks(lus, self.lufrmap, num_label=self.num_senses, masking=self.masking).to(device) sense_loss = 0 # loss for sense id arg_loss = 0 # loss for arg id if senses is not None: for i in range(len(sense_logits)): sense_logit = sense_logits[i] arg_logit = arg_logits[i] lufr_mask = lufr_masks[i] gold_sense = senses[i] gold_arg = args[i] #train sense classifier loss_fct_sense = CrossEntropyLoss(weight=lufr_mask) loss_per_seq_for_sense = loss_fct_sense( sense_logit.view(-1, self.num_senses), gold_sense.view(-1)) sense_loss += loss_per_seq_for_sense #train arg classifier masked_sense_logit = utils.masking_logit( sense_logit, lufr_mask) pred_sense, sense_score = utils.logit2label(masked_sense_logit) frarg_mask = utils.get_masks([pred_sense], self.frargmap, num_label=self.num_args, masking=True).to(device)[0] loss_fct_arg = CrossEntropyLoss(weight=frarg_mask) # only keep active parts of loss if attention_mask is not None: active_loss = attention_mask[i].view(-1) == 1 active_logits = arg_logit.view(-1, self.num_args)[active_loss] active_labels = gold_arg.view(-1)[active_loss] loss_per_seq_for_arg = loss_fct_arg( active_logits, active_labels) else: loss_per_seq_for_arg = loss_fct_arg( arg_logit.view(-1, self.num_args), gold_arg.view(-1)) arg_loss += loss_per_seq_for_arg # 0.5 weighted loss # if self.original_loss: # loss = (sense_loss, arg_loss) # else: total_loss = 0.5 * sense_loss + 0.5 * arg_loss loss = total_loss / len(sense_logits) if self.return_pooled_output: return pooled_output, loss else: return loss else: if self.return_pooled_output: return pooled_output, sense_logits, arg_logits else: return sense_logits, arg_logits
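The sense loss above passes the lexical-unit mask as the `weight` argument of CrossEntropyLoss. In PyTorch that weight rescales each example's loss according to its *target* class, so targets whose mask entry is 0 contribute nothing. A dummy-tensor sketch (the mask values are invented):

```python
import torch
from torch.nn import CrossEntropyLoss

num_senses = 5
lufr_mask = torch.tensor([1., 0., 1., 0., 0.])   # hypothetical: only senses 0 and 2 allowed
sense_logit = torch.randn(1, num_senses)
gold_sense = torch.tensor([2])

loss = CrossEntropyLoss(weight=lufr_mask)(sense_logit, gold_sense)
```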
def main(): argument_parser = argparse.ArgumentParser() argument_parser.add_argument("--path_to_train_data", type=str, required=True) argument_parser.add_argument("--path_to_eval_data", type=str, required=False, default=None) argument_parser.add_argument("--n_epochs", type=int, required=False, default=3) argument_parser.add_argument("--batch_size", type=int, required=False, default=32) argument_parser.add_argument("--bptt", type=int, required=False, default=64) argument_parser.add_argument("--lr", type=float, required=False, default=0.0001) argument_parser.add_argument("--vocabulary_size", type=int, required=False, default=20000) argument_parser.add_argument("--embedding_dimension", type=int, required=False, default=300) argument_parser.add_argument("--hidden_units_for_lstm", type=int, required=False, default=256) argument_parser.add_argument("--num_of_lstm_layer", type=int, required=False, default=1) argument_parser.add_argument("--n_decoder_blocks", type=int, required=False, default=5) arguments = argument_parser.parse_args() train_language_modeling_dataset = LanguageModelingDataset( arguments.batch_size, arguments.bptt) train_language_modeling_dataset.set_tokenizer(ByteLevelBPETokenizer()) train_language_modeling_dataset.fit( arguments.path_to_train_data, vocabulary_size=arguments.vocabulary_size) train_language_modeling_dataloader = LanguageModelingDataLoader( arguments.bptt, train_language_modeling_dataset.transform(arguments.path_to_train_data, return_target=True), ) model = LSTMModel( arguments.vocabulary_size, arguments.embedding_dimension, arguments.hidden_units_for_lstm, arguments.n_decoder_blocks, arguments.num_of_lstm_layer, ) logger = TensorboardLogger() trainer = Trainer(arguments.batch_size) trainer.set_logger(logger) if arguments.path_to_eval_data: eval_language_modeling_dataloader = LanguageModelingDataLoader( arguments.bptt, train_language_modeling_dataset.transform( arguments.path_to_eval_data, return_target=True), ) trainer.train( model, train_language_modeling_dataloader, CrossEntropyLoss(), Adam(model.parameters(), arguments.lr), eval_language_modeling_dataloader, arguments.n_epochs, ) else: trainer.train( model, train_language_modeling_dataloader, CrossEntropyLoss(), Adam(model.parameters(), arguments.lr), None, arguments.n_epochs, ) logger.log_params(vars(arguments), trainer.losses) saver = Saver(logger.log_dir()) saver.save_preprocessor_and_model(train_language_modeling_dataset, model)
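The Trainer, dataset and dataloader classes above are project-specific, but the core update they wrap is the standard language-model step: run the LSTM, flatten the `(batch, bptt, vocab)` logits and the next-token targets, apply CrossEntropyLoss and step Adam. A self-contained, hypothetical sketch (the toy model and sizes are illustrative, not the project's API):

```python
import torch
from torch.nn import CrossEntropyLoss, Embedding, LSTM, Linear, Module
from torch.optim import Adam

class TinyLSTMLM(Module):
    def __init__(self, vocab_size=100, embedding_dim=32, hidden_units=64):
        super().__init__()
        self.embed = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(embedding_dim, hidden_units, batch_first=True)
        self.head = Linear(hidden_units, vocab_size)

    def forward(self, x):
        out, _ = self.lstm(self.embed(x))
        return self.head(out)

model = TinyLSTMLM()
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=1e-4)

inputs = torch.randint(0, 100, (8, 16))     # (batch_size, bptt)
targets = torch.randint(0, 100, (8, 16))    # next-token targets, same shape
logits = model(inputs)                      # (batch_size, bptt, vocab_size)
loss = criterion(logits.view(-1, 100), targets.view(-1))
loss.backward()
optimizer.step()
```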
def forward( self, input_ids=None, first_check=None, position=None, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs: loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: import torch from transformers import GPT2Tokenizer, GPT2LMHeadModel tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2LMHeadModel.from_pretrained('gpt2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, logits = outputs[:2] """ transformer_outputs, position = self.transformer( input_ids, first_check, position, past=past, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) return transformer_outputs, position hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) outputs = (lm_logits, ) + transformer_outputs[1:] if labels is not None: # Shift so that tokens < n predict n shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss, ) + outputs return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions)
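The loss block above (which the early `return transformer_outputs, position` short-circuits in this modified forward) follows the same shift-by-one pattern as the other LM heads. When debugging such a loss it can help to inspect the per-token values with `reduction="none"`; positions labelled -100 come back as exactly 0:

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size = 7
shift_logits = torch.randn(1, 4, vocab_size)
shift_labels = torch.tensor([[3, -100, 5, 1]])         # one ignored position

per_token = CrossEntropyLoss(reduction="none")(
    shift_logits.view(-1, vocab_size), shift_labels.view(-1))
print(per_token)   # the -100 position contributes a loss of 0
```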