def forward( self, input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = transformer_outputs[0] logits = self.classifier(hidden_states) if input_ids is not None: batch_size, sequence_length = input_ids.shape[:2] else: batch_size, sequence_length = inputs_embeds.shape[:2] assert ( self.config.pad_token_id is not None or batch_size == 1 ), "Cannot handle batch sizes > 1 if no padding token is defined." if self.config.pad_token_id is None: sequence_lengths = -1 else: if input_ids is not None: sequence_lengths = torch.ne( input_ids, self.config.pad_token_id).sum(-1) - 1 else: sequence_lengths = -1 logger.warning( f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " f"unexpected if using padding tokens in conjunction with `inputs_embeds.`" ) pooled_logits = logits[range(batch_size), sequence_lengths] loss = None if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (pooled_logits, ) + transformer_outputs[2:] return ((loss, ) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=pooled_logits, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, )
def forward( self, input_ids=None, bbox=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: Examples:: >>> from transformers import LayoutLMTokenizer, LayoutLMForTokenClassification >>> import torch >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased') >>> model = LayoutLMForTokenClassification.from_pretrained('microsoft/layoutlm-base-uncased') >>> words = ["Hello", "world"] >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782] >>> token_boxes = [] >>> for word, box in zip(words, normalized_word_boxes): ... word_tokens = tokenizer.tokenize(word) ... token_boxes.extend([box] * len(word_tokens)) >>> # add bounding boxes of cls + sep tokens >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]] >>> encoding = tokenizer(' '.join(words), return_tensors="pt") >>> input_ids = encoding["input_ids"] >>> attention_mask = encoding["attention_mask"] >>> token_type_ids = encoding["token_type_ids"] >>> bbox = torch.tensor([token_boxes]) >>> token_labels = torch.tensor([1,1,0,0]).unsqueeze(0) # batch size of 1 >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids, ... labels=token_labels) >>> loss = outputs.loss >>> logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.layoutlm( input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) loss = None if labels is not None: loss_fct = CrossEntropyLoss() if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
def forward( self, input_ids=None, bbox=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: Examples:: >>> from transformers import LayoutLMTokenizer, LayoutLMForSequenceClassification >>> import torch >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased') >>> model = LayoutLMForSequenceClassification.from_pretrained('microsoft/layoutlm-base-uncased') >>> words = ["Hello", "world"] >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782] >>> token_boxes = [] >>> for word, box in zip(words, normalized_word_boxes): ... word_tokens = tokenizer.tokenize(word) ... token_boxes.extend([box] * len(word_tokens)) >>> # add bounding boxes of cls + sep tokens >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]] >>> encoding = tokenizer(' '.join(words), return_tensors="pt") >>> input_ids = encoding["input_ids"] >>> attention_mask = encoding["attention_mask"] >>> token_type_ids = encoding["token_type_ids"] >>> bbox = torch.tensor([token_boxes]) >>> sequence_label = torch.tensor([1]) >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids, ... labels=sequence_label) >>> loss = outputs.loss >>> logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.layoutlm( input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) loss = None if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
def forward( self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None, position_ids=None, head_mask=None, inputs_embeds=None, output_attentions=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor`` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss. classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import RobertaTokenizer, RobertaForMultipleChoice import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForMultipleChoice.from_pretrained('roberta-base') choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices labels = torch.tensor(1).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, classification_scores = outputs[:2] """ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None flat_inputs_embeds = ( inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) if inputs_embeds is not None else None ) outputs = self.roberta( flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids, attention_mask=flat_attention_mask, head_mask=head_mask, inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) reshaped_logits = logits.view(-1, num_choices) outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) outputs = (loss,) + outputs return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: # The checkpoint roberta-large is not fine-tuned for question answering. Please see the # examples/question-answering/run_squad.py example to see how to fine-tune a model to a question answering task. from transformers import RobertaTokenizer, RobertaForQuestionAnswering import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForQuestionAnswering.from_pretrained('roberta-base') question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" input_ids = tokenizer.encode(question, text) start_scores, end_scores = model(torch.tensor([input_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) """ outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) outputs = (start_logits, end_logits,) + outputs[2:] if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 outputs = (total_loss,) + outputs return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_eval_on_train", action='store_true', help="Whether to run eval on the train set.") parser.add_argument("--do_test", action='store_true', help="Whether to run test and create submission.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--seed", default=None, type=int, help="Seed for randomized elements in the training") parser.add_argument("--eval_batch_size", default=16, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") ## Our arguements parser.add_argument( "--mode", choices=[ "none", "distill", "smoothed_distill", "smoothed_distill_annealed", "label_smoothing", "theta_smoothed_distill", "reweight_baseline", "smoothed_reweight_baseline", "permute_smoothed_distill", "bias_product_baseline", "learned_mixin_baseline", "reweight_by_teacher", "reweight_by_teacher_annealed", "bias_product_by_teacher", "bias_product_by_teacher_annealed", "focal_loss" ]) parser.add_argument("--penalty", type=float, default=0.03, help="Penalty weight for the learn_mixin model") parser.add_argument("--focal_loss_gamma", type=float, default=1.0) parser.add_argument("--n_processes", type=int, default=4, help="Processes to use for pre-processing") parser.add_argument("--debug", action="store_true") parser.add_argument("--debug_num", type=int, default=2000) parser.add_argument( "--sorted", action="store_true", help='Sort the data so most batches have the same input length,' ' makes things about 2x faster. Our experiments did not actually' ' use this in the end (not sure if it makes a difference) so ' 'its off by default.') parser.add_argument("--which_bias", choices=["hans", "hypo", "hans_json", "mix", "dam"], required=True) parser.add_argument("--custom_teacher", default=None) parser.add_argument("--custom_bias", default=None) parser.add_argument("--theta", type=float, default=0.1, help="for theta smoothed distillation loss") parser.add_argument("--add_bias_on_eval", action="store_true") args = parser.parse_args() utils.add_stdout_logger() if args.mode == "none": loss_fn = clf_distill_loss_functions.Plain() elif args.mode == "distill": loss_fn = clf_distill_loss_functions.DistillLoss() elif args.mode == "smoothed_distill": loss_fn = clf_distill_loss_functions.SmoothedDistillLoss() elif args.mode == "smoothed_distill_annealed": loss_fn = clf_distill_loss_functions.SmoothedDistillLossAnnealed() elif args.mode == "theta_smoothed_distill": loss_fn = clf_distill_loss_functions.ThetaSmoothedDistillLoss( args.theta) elif args.mode == "label_smoothing": loss_fn = clf_distill_loss_functions.LabelSmoothing(3) elif args.mode == "reweight_baseline": loss_fn = clf_distill_loss_functions.ReweightBaseline() elif args.mode == "permute_smoothed_distill": loss_fn = clf_distill_loss_functions.PermuteSmoothedDistillLoss() elif args.mode == "smoothed_reweight_baseline": loss_fn = clf_distill_loss_functions.SmoothedReweightLoss() elif args.mode == "bias_product_baseline": loss_fn = clf_distill_loss_functions.BiasProductBaseline() elif args.mode == "learned_mixin_baseline": loss_fn = clf_distill_loss_functions.LearnedMixinBaseline(args.penalty) elif args.mode == "reweight_by_teacher": loss_fn = clf_distill_loss_functions.ReweightByTeacher() elif args.mode == "reweight_by_teacher_annealed": loss_fn = clf_distill_loss_functions.ReweightByTeacherAnnealed() elif args.mode == "bias_product_by_teacher": loss_fn = clf_distill_loss_functions.BiasProductByTeacher() elif args.mode == "bias_product_by_teacher_annealed": loss_fn = clf_distill_loss_functions.BiasProductByTeacherAnnealed() elif args.mode == "focal_loss": loss_fn = clf_distill_loss_functions.FocalLoss( gamma=args.focal_loss_gamma) else: raise RuntimeError() output_dir = args.output_dir if args.do_train: if exists(output_dir): if len(os.listdir(output_dir)) > 0: logging.warning("Output dir exists and is non-empty") else: os.makedirs(output_dir) print("Saving model to %s" % output_dir) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps if args.seed is not None: random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval and not args.do_test: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(output_dir) and os.listdir(output_dir) and args.do_train: logging.warning( "Output directory ({}) already exists and is not empty.".format( output_dir)) if not os.path.exists(output_dir): os.makedirs(output_dir) # Its way ot easy to forget if this is being set by a command line flag if "-uncased" in args.bert_model: do_lower_case = True elif "-cased" in args.bert_model: do_lower_case = False else: raise NotImplementedError(args.bert_model) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=do_lower_case) num_train_optimization_steps = None train_examples = None if args.do_train: train_examples = load_mnli(True, args.debug_num if args.debug else None) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) loss_fn.num_train_optimization_steps = int( num_train_optimization_steps) loss_fn.num_epochs = int(args.num_train_epochs) # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format( args.local_rank)) model = BertDistill.from_pretrained(args.bert_model, cache_dir=cache_dir, num_labels=3, loss_fn=loss_fn) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features: List[InputFeatures] = convert_examples_to_features( train_examples, args.max_seq_length, tokenizer, args.n_processes) if args.which_bias == "mix": hypo_bias_map = load_bias("hypo") hans_bias_map = load_bias("hans") bias_map = {} def compute_entropy(probs, base=3): return -(probs * (np.log(probs) / np.log(base))).sum() for key in hypo_bias_map.keys(): hypo_ent = compute_entropy(np.exp(hypo_bias_map[key])) hans_ent = compute_entropy(np.exp(hans_bias_map[key])) if hypo_ent < hans_ent: bias_map[key] = hypo_bias_map[key] else: bias_map[key] = hans_bias_map[key] else: bias_map = load_bias(args.which_bias, custom_path=args.custom_bias) for fe in train_features: fe.bias = bias_map[fe.example_id].astype(np.float32) teacher_probs_map = load_teacher_probs(args.custom_teacher) for fe in train_features: fe.teacher_probs = np.array( teacher_probs_map[fe.example_id]).astype(np.float32) example_map = {} for ex in train_examples: example_map[ex.id] = ex logging.info("***** Running training *****") logging.info(" Num examples = %d", len(train_examples)) logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) train_dataloader = build_train_dataloader(train_features, args.train_batch_size, args.seed, args.sorted) model.train() loss_ema = 0 total_steps = 0 decay = 0.99 for _ in trange(int(args.num_train_epochs), desc="Epoch", ncols=100): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 pbar = tqdm(train_dataloader, desc="loss", ncols=100) for step, batch in enumerate(pbar): batch = tuple(t.to(device) for t in batch) if bias_map is not None: example_ids, input_ids, input_mask, segment_ids, label_ids, bias, teacher_probs = batch else: bias = None example_ids, input_ids, input_mask, segment_ids, label_ids = batch logits, loss = model(input_ids, segment_ids, input_mask, label_ids, bias, teacher_probs) total_steps += 1 loss_ema = loss_ema * decay + loss.cpu().detach().numpy() * ( 1 - decay) descript = "loss=%.4f" % (loss_ema / (1 - decay**total_steps)) pbar.set_description(descript, refresh=False) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Record the args as well arg_dict = {} for arg in vars(args): arg_dict[arg] = getattr(args, arg) with open(join(output_dir, "args.json"), 'w') as out_fh: json.dump(arg_dict, out_fh) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertDistill(config, num_labels=3, loss_fn=loss_fn) model.load_state_dict(torch.load(output_model_file)) else: output_config_file = os.path.join(output_dir, CONFIG_NAME) config = BertConfig.from_json_file(output_config_file) output_model_file = os.path.join(output_dir, WEIGHTS_NAME) model = BertDistill(config, num_labels=3, loss_fn=loss_fn) model.load_state_dict(torch.load(output_model_file)) model.to(device) if not args.do_eval and not args.do_test: return if not (args.local_rank == -1 or torch.distributed.get_rank() == 0): return model.eval() if args.do_eval: eval_datasets = [("mnli_dev_m", load_mnli(False)), ("mnli_dev_mm", load_mnli(False, custom_path="dev_mismatched.tsv"))] # eval_datasets += load_easy_hard(prefix="overlap_", no_mismatched=True) eval_datasets += load_easy_hard() eval_datasets += [("hans", load_hans())] eval_datasets += load_hans_subsets() # stress test # eval_datasets += [("negation_m", load_jsonl("multinli_0.9_negation_matched.jsonl", # "../dataset/StressTests/Negation"))] # eval_datasets += [("negation_mm", load_jsonl("multinli_0.9_negation_mismatched.jsonl", # "../dataset/StressTests/Negation"))] # eval_datasets += [("overlap_m", load_jsonl("multinli_0.9_taut2_matched.jsonl", # "../dataset/StressTests/Word_Overlap"))] # eval_datasets += [("overlap_mm", load_jsonl("multinli_0.9_taut2_mismatched.jsonl", # "../dataset/StressTests/Word_Overlap"))] # eval_datasets += [("length_m", load_jsonl("multinli_0.9_length_mismatch_matched.jsonl", # "../dataset/StressTests/Length_Mismatch"))] # eval_datasets += [("length_mm", load_jsonl("multinli_0.9_length_mismatch_mismatched.jsonl", # "../dataset/StressTests/Length_Mismatch"))] # eval_datasets = [("rte", load_jsonl("eval_rte.jsonl", # "../dataset/mnli_eval_suite"))] # eval_datasets += [("rte_glue", load_jsonl("eval_glue_rte.jsonl", # "../dataset/mnli_eval_suite"))] # eval_datasets += [("sick", load_jsonl("eval_sick.jsonl", # "../dataset/mnli_eval_suite"))] # eval_datasets += [("diagnostic", load_jsonl("diagnostic-full.jsonl", # "../dataset/mnli_eval_suite"))] # eval_datasets += [("scitail", load_jsonl("scitail_1.0_test.txt", # "../dataset/scitail/snli_format"))] # todo delete if args.do_eval_on_train: eval_datasets = [("mnli_train", load_mnli(True))] else: eval_datasets = [] if args.do_test: test_datasets = load_all_test_jsonl() eval_datasets += test_datasets subm_paths = [ "../submission/{}.csv".format(x[0]) for x in test_datasets ] for ix, (name, eval_examples) in enumerate(eval_datasets): logging.info("***** Running evaluation on %s *****" % name) logging.info(" Num examples = %d", len(eval_examples)) logging.info(" Batch size = %d", args.eval_batch_size) eval_features = convert_examples_to_features(eval_examples, args.max_seq_length, tokenizer) eval_features.sort(key=lambda x: len(x.input_ids)) all_label_ids = np.array([x.label_id for x in eval_features]) eval_dataloader = build_eval_dataloader(eval_features, args.eval_batch_size) eval_loss = 0 nb_eval_steps = 0 probs = [] test_subm_ids = [] for example_ids, input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating", ncols=100): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask) # create eval loss and other metric required by the task loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, 3), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 probs.append( torch.nn.functional.softmax(logits, 1).detach().cpu().numpy()) test_subm_ids.append(example_ids.cpu().numpy()) probs = np.concatenate(probs, 0) test_subm_ids = np.concatenate(test_subm_ids, 0) eval_loss = eval_loss / nb_eval_steps if "hans" in name: # take max of non-entailment rather than taking their sum probs[:, 0] = probs[:, [0, 2]].max(axis=1) # probs[:, 0] = probs[:, 0] + probs[:, 2] probs = probs[:, :2] preds = np.argmax(probs, axis=1) result = {"acc": simple_accuracy(preds, all_label_ids)} result["loss"] = eval_loss conf_plot_file = os.path.join(output_dir, "eval_%s_confidence.png" % name) ECE, bins_acc, bins_conf, bins_num = visualize_predictions( probs, all_label_ids, conf_plot_file=conf_plot_file) result["ECE"] = ECE result["bins_acc"] = bins_acc result["bins_conf"] = bins_conf result["bins_num"] = bins_num output_eval_file = os.path.join(output_dir, "eval_%s_results.txt" % name) output_all_eval_file = os.path.join(output_dir, "eval_all_results.txt") with open(output_eval_file, "w") as writer, open(output_all_eval_file, "a") as all_writer: logging.info("***** Eval results *****") all_writer.write("eval results on %s:\n" % name) for key in sorted(result.keys()): logging.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) all_writer.write("%s = %s\n" % (key, str(result[key]))) output_answer_file = os.path.join(output_dir, "eval_%s_answers.json" % name) answers = { ex.example_id: [float(x) for x in p] for ex, p in zip(eval_features, probs) } with open(output_answer_file, "w") as f: json.dump(answers, f) # prepare submission file if args.do_test and ix >= len(eval_datasets) - len(test_datasets): with open(subm_paths.pop(0), "w") as subm_f: subm_f.write("pairID,gold_label\n") for sub_id, pred_label_id in zip(test_subm_ids, preds): subm_f.write("{},{}\n".format( str(sub_id), REV_NLI_LABEL_MAP[pred_label_id]))
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, **kwargs ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): Used to hide legacy arguments that have been deprecated. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import RobertaTokenizer, RobertaForMaskedLM import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForMaskedLM.from_pretrained('roberta-base') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, prediction_scores = outputs[:2] """ if "masked_lm_labels" in kwargs: warnings.warn( "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", DeprecationWarning, ) labels = kwargs.pop("masked_lm_labels") assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, ) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
def forward( self, input_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, **deprecated_arguments ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutputWithPast]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ if deprecated_arguments.pop("position_ids", False) is not False: # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` warnings.warn( "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" " passing `position_ids`.", FutureWarning, ) if len(deprecated_arguments) > 0: raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = transformer_outputs[0] logits = self.score(hidden_states) if input_ids is not None: batch_size = input_ids.shape[0] else: batch_size = inputs_embeds.shape[0] if self.config.pad_token_id is None and batch_size != 1: raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") if self.config.pad_token_id is None: sequence_lengths = -1 else: if input_ids is not None: sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(dim=-1) - 1 else: sequence_lengths = -1 logger.warning( f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " "unexpected if using padding tokens in conjunction with `inputs_embeds.`" ) pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) else: loss = loss_fct(pooled_logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(pooled_logits, labels) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(pooled_logits, labels) if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutputWithPast( loss=loss, logits=pooled_logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, )
pretrained_embeddings = load_pretrained_embeddings(embeddings_path, train_dataset.word2idx, 300, is_crf=crf_model) name_ = 'LSTM' hp = HyperParameters(name_, train_dataset.word2idx, train_dataset.labels2idx, pretrained_embeddings, batch_size) # , collate_fn=DatasetParser.pad_collate train_dataset_ = DataLoader(dataset=train_dataset, batch_size=batch_size) dev_dataset_ = DataLoader(dataset=dev_dataset, batch_size=batch_size) test_dataset_ = DataLoader(dataset=test_dataset, batch_size=batch_size) model = BaselineModel(hp).to(train_dataset.get_device) trainer = Trainer( model=model, loss_function=CrossEntropyLoss(ignore_index=train_dataset.labels2idx['<PAD>']), optimizer=Adam(model.parameters()), batch_num=hp.batch_size, num_classes=hp.num_classes, verbose=True ) save_to_ = join(RESOURCES_PATH, f"{model.name}_model.pt") trainer.train(train_dataset_, dev_dataset_, epochs=1, save_to=save_to_) evaluator = Evaluator(model, test_dataset_, crf_model) evaluator.check_performance(train_dataset.idx2label)
def forward( self, input_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, **deprecated_arguments ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ if deprecated_arguments.pop("position_ids", False) is not False: # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` warnings.warn( "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" " passing `position_ids`.", FutureWarning, ) if len(deprecated_arguments) > 0: raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = transformer_outputs[0] hidden_states = self.dropout(hidden_states) logits = self.classifier(hidden_states) loss = None if labels is not None: batch_size, seq_length = labels.shape loss_fct = CrossEntropyLoss() loss = loss_fct( logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length) ) if not return_dict: output = (logits,) + transformer_outputs[2:] return ((loss,) + output) if loss is not None else output return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, )
def forward( self, input_ids: Optional[torch.LongTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, inputs_embeds: Optional[torch.Tensor] = None, labels: Optional[torch.Tensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, **deprecated_arguments ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` """ if deprecated_arguments.pop("position_ids", False) is not False: # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` warnings.warn( "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" " passing `position_ids`.", FutureWarning, ) if len(deprecated_arguments) > 0: raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) loss = None if labels is not None: # Shift so that tokens < n predict n shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() batch_size, seq_length, vocab_size = shift_logits.shape # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct( shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length) ) if not return_dict: output = (lm_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output return CausalLMOutputWithCrossAttentions( loss=loss, logits=lm_logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, )
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ): outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) outputs = (prediction_scores, ) + outputs[ 2:] # Add hidden states and attention if they are here # Although this may seem awkward, BertForMaskedLM supports two scenarios: # 1. If a tensor that contains the indices of masked labels is provided, # the cross-entropy is the MLM cross-entropy that measures the likelihood # of predictions for masked words. # 2. If `lm_labels` is provided we are in a causal scenario where we # try to predict the next token for each input in the decoder. if masked_lm_labels is not None: loss_fct = CrossEntropyLoss( ignore_index=-1, reduction='none') # -1 index = padding token masked_lm_loss = loss_fct(prediction_scores.permute(0, 2, 1), masked_lm_labels) masked_lm_loss_normalized = torch.div( torch.mean(masked_lm_loss, 1), (masked_lm_labels > -1).sum(dim=1, dtype=torch.float32)) masked_lm_loss_normalized[torch.isnan( masked_lm_loss_normalized)] = 0.0 outputs = (masked_lm_loss_normalized, ) + outputs if lm_labels is not None: # we are doing next-token prediction; shift prediction scores and input ids by one prediction_scores = prediction_scores[:, :-1, :].contiguous() lm_labels = lm_labels[:, 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-1) ltr_lm_loss = loss_fct( prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1)) outputs = (ltr_lm_loss, ) + outputs return outputs # (masked_lm_loss), (ltr_lm_loss), prediction_scores, (hidden_states), (attentions)
def configure_weighted_loss(self): balanced = self.train_df['label'].mean() weights = torch.tensor([1 - balanced, balanced]) weights = to_gpu(weights) return CrossEntropyLoss(weight=weights)
# Prepare dataloaders train_loader = DataLoader(train_set, batch_size=2000, shuffle=True, num_workers=4) # train_set: bastch_size = targets.shape / 500 val_loader = DataLoader(val_set, batch_size=4000, shuffle=False, num_workers=4) # train_set: bastch_size : larger is better if the GPU memory is enough. # num_workers = number of cpu core, but limited by the disk speed. so 8 is good. # Prepare trainer trainer = Trainer(cpic(), CrossEntropyLoss(), lr=0.1) # Train model over training dataset trainer.train(train_loader, val_loader, epochs=300, print_freq=100) #resume='checkpoint_best.pth.tar') # Save training results to disk trainer.results(path='Taiwan20092010_results.pth.tar') # Validate saved model results = torch.load('Taiwan20092010_results.pth.tar') model = cpic() model.load_state_dict(results['model']) trainer = Trainer(model, CrossEntropyLoss(), lr=0.1) trainer.validate(val_loader, print_freq=100)
def forward( self, input_ids: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, SequenceClassifierOutputWithPast]: r""" labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.model( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = transformer_outputs[0] logits = self.score(hidden_states) if input_ids is not None: batch_size, sequence_length = input_ids.shape[:2] else: batch_size, sequence_length = inputs_embeds.shape[:2] if self.config.pad_token_id is None: sequence_lengths = -1 else: if input_ids is not None: sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1 else: sequence_lengths = -1 logger.warning( f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " "unexpected if using padding tokens in conjunction with `inputs_embeds.`" ) pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" if self.config.problem_type == "regression": loss_fct = MSELoss() if self.num_labels == 1: loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) else: loss = loss_fct(pooled_logits, labels) elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) elif self.config.problem_type == "multi_label_classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(pooled_logits, labels) if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output return SequenceClassifierOutputWithPast( loss=loss, logits=pooled_logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, )
def main(): # noqa C901 parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the training files for the CoNLL-2003 NER task.", ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written.", ) ## Other parameters parser.add_argument( "--labels", default="", type=str, help= "Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.", ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name", ) parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=512, type=int, help= "The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument( "--do_predict", action="store_true", help="Whether to run predictions on the test set.", ) parser.add_argument( "--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model.", ) parser.add_argument( "--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.", ) parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.", ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument( "--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.", ) parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.", ) parser.add_argument( "--max_steps", default=-1, type=int, help= "If > 0: set total number of training steps to perform. Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") parser.add_argument( "--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.", ) parser.add_argument( "--eval_all_checkpoints", action="store_true", help= "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument( "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory", ) parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets", ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument( "--local_rank", type=int, default=-1, help="For distributed training: local_rank", ) parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") args = parser.parse_args() if (os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train): if not args.overwrite_output_dir: raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome." .format(args.output_dir)) else: if args.local_rank in [-1, 0]: shutil.rmtree(args.output_dir) if not os.path.exists(args.output_dir) and (args.do_eval or args.do_predict): raise ValueError( "Output directory ({}) does not exist. Please train and save the model before inference stage." .format(args.output_dir)) if (not os.path.exists(args.output_dir) and args.do_train and args.local_rank in [-1, 0]): os.makedirs(args.output_dir) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( filename=os.path.join(args.output_dir, "train.log") if args.local_rank in [-1, 0] else None, format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) labels = get_labels(args.labels) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, ) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset = FunsdDataset(args, tokenizer, labels, pad_token_label_id, mode="train") global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` model_to_save = (model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, "training_args.bin")) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))) logging.getLogger("pytorch_transformers.modeling_utils").setLevel( logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split( "-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) model.to(args.device) result, _ = evaluate( args, model, tokenizer, labels, pad_token_label_id, mode="test", prefix=global_step, ) if global_step: result = { "{}_{}".format(global_step, k): v for k, v in result.items() } results.update(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) if args.do_predict and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained( args.model_name_or_path, do_lower_case=args.do_lower_case) model = model_class.from_pretrained(args.output_dir) model.to(args.device) result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test") # Save results output_test_results_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_results_file, "w") as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt") with open(output_test_predictions_file, "w", encoding="utf8") as writer: with open(os.path.join(args.data_dir, "test.txt"), "r", encoding="utf8") as f: example_id = 0 for line in f: if line.startswith( "-DOCSTART-") or line == "" or line == "\n": writer.write(line) if not predictions[example_id]: example_id += 1 elif predictions[example_id]: output_line = (line.split()[0] + " " + predictions[example_id].pop(0) + "\n") writer.write(output_line) else: logger.warning( "Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0], ) return results
def forward( self, input_ids: torch.LongTensor = None, attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, past_key_values: Optional[List[torch.FloatTensor]] = None, inputs_embeds: Optional[torch.FloatTensor] = None, labels: Optional[torch.LongTensor] = None, use_cache: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, CausalLMOutputWithPast]: r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it. Indices can be obtained using [`OPTTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) head_mask (`torch.Tensor` of shape `(num_hidden_layers, num_attention_heads)`, *optional*): Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional tensors are only required when the model is used as a decoder in a Sequence to Sequence model. Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. use_cache (`bool`, *optional*): If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`). output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. Returns: Example: ```python >>> from transformers import GPT2Tokenizer, OPTForCausalLM >>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m") >>> tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m") >>> prompt = "Hey, are you consciours? Can you talk to me?" >>> inputs = tokenizer(prompt, return_tensors="pt") >>> # Generate >>> generate_ids = model.generate(inputs.input_ids, max_length=30) >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) outputs = self.model.decoder( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, past_key_values=past_key_values, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) logits = self.lm_head(outputs[0]).contiguous() loss = None if labels is not None: # Shift so that tokens < n predict n shift_logits = logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1)) if not return_dict: output = (logits,) + outputs[1:] return (loss,) + output if loss is not None else output return CausalLMOutputWithPast( loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
def __init__(self, hparams): self.labels = get_labels(hparams.labels) num_labels = len(self.labels) self.pad_token_label_id = CrossEntropyLoss().ignore_index super().__init__(hparams, num_labels, self.mode)
def __init__(self, opt): super(AdvisorAgent, self).__init__() self.model_name = opt['model_name'] self.evaluate_every = opt['evaluate_every_steps'] if self.model_name == 'advisor': Module = Advisor elif self.model_name == 'hred_db': Module = HRED_DB elif self.model_name == 'hred_db0': Module = HRED_DB0 elif self.model_name == 'hred': Module = HRED else: Module = BiLSTM opt['cuda'] = not opt['no_cuda'] and torch.cuda.is_available() if opt['cuda']: print('[ Using CUDA ]') torch.cuda.device(opt['gpu']) # torch.cuda.device([0, 1]) # It enables benchmark mode in cudnn, which # leads to faster runtime when the input sizes do not vary. cudnn.benchmark = True self.opt = opt if opt['pre_word2vec']: pre_w2v = load_ndarray(opt['pre_word2vec']) else: pre_w2v = None self.evaluator = Evaluator(CrossEntropyLoss(), batch_size=opt['batch_size'], use_cuda=opt['cuda'], model_name=self.model_name) self.score_type = 'clf' self.model = Module(opt['vocab_size'], \ opt['word_emb_size'], \ opt['hidden_size'], \ init_w2v=pre_w2v, \ enc_type=opt['enc_type'], \ rnn_type=opt['rnn_type'], \ bidirectional=not opt['no_bidirectional'], \ utter_enc_dropout=opt['utter_enc_dropout'], \ knowledge_enc_dropout=opt['knowledge_enc_dropout'], \ atten_type=opt['atten_type'], \ score_type=self.score_type, \ use_cuda=opt['cuda']#, \ # phase=opt['phase'] ) if opt['cuda']: self.model.cuda() # self.model = torch.nn.DataParallel(self.model, device_ids=[0, 1]) if self.score_type == 'ranking': # MultiLabelMarginLoss # For each sample in the mini-batch: # loss(x, y) = sum_ij(max(0, 1 - (x[y[j]] - x[i]))) / x.size(0) self.loss_fn = MultiLabelMarginLoss() else: self.loss_fn = CrossEntropyLoss() optim_params = [p for p in self.model.parameters() if p.requires_grad] lr = opt['learning_rate'] if opt['optimizer'] == 'sgd': self.optimizers = {self.model_name: optim.SGD(optim_params, lr=lr)} elif opt['optimizer'] == 'adam': self.optimizers = { self.model_name: optim.Adam(optim_params, lr=lr) } elif opt['optimizer'] == 'adadelta': self.optimizers = { self.model_name: optim.Adadelta(optim_params, lr=lr) } elif opt['optimizer'] == 'adagrad': self.optimizers = { self.model_name: optim.Adagrad(optim_params, lr=lr) } elif opt['optimizer'] == 'adamax': self.optimizers = { self.model_name: optim.Adamax(optim_params, lr=lr) } elif opt['optimizer'] == 'rmsprop': self.optimizers = { self.model_name: optim.RMSprop(optim_params, lr=lr) } else: raise NotImplementedError('Optimizer not supported.') self.scheduler = ReduceLROnPlateau(self.optimizers[self.model_name], mode='min', \ patience=opt['valid_patience'] // 3, verbose=True) if opt.get('model_file') and os.path.isfile(opt['model_file']): print('Loading existing model parameters from ' + opt['model_file']) self.load(opt['model_file'])
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, start_positions=None, end_positions=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] logits = self.qa_outputs(sequence_output) start_logits, end_logits = logits.split(1, dim=-1) start_logits = start_logits.squeeze(-1) end_logits = end_logits.squeeze(-1) total_loss = None if start_positions is not None and end_positions is not None: # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) if len(end_positions.size()) > 1: end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms ignored_index = start_logits.size(1) start_positions.clamp_(0, ignored_index) end_positions.clamp_(0, ignored_index) loss_fct = CrossEntropyLoss(ignore_index=ignored_index) start_loss = loss_fct(start_logits, start_positions) end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 if not return_dict: output = (start_logits, end_logits) + outputs[2:] return ((total_loss, ) + output) if total_loss is not None else output return QuestionAnsweringModelOutput( loss=total_loss, start_logits=start_logits, end_logits=end_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import RobertaTokenizer, RobertaForSequenceClassification import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForSequenceClassification.from_pretrained('roberta-base') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] """ outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, ) sequence_output = outputs[0] logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), logits, (hidden_states), (attentions)
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.", ) parser.add_argument( "--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()), ) parser.add_argument( "--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS), ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.", ) parser.add_argument( "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name" ) parser.add_argument( "--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name", ) parser.add_argument( "--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3", ) parser.add_argument( "--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.", ) parser.add_argument("--do_train", action="store_true", help="Whether to run training.") parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.") parser.add_argument( "--evaluate_during_training", action="store_true", help="Whether to run evaluation during training at each logging step.", ) parser.add_argument( "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model." ) parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument( "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation." ) parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.", ) parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--adam_beta1", default=0.9, type=float, help="BETA1 for Adam optimizer.") parser.add_argument("--adam_beta2", default=0.999, type=float, help="BETA2 for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument( "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform." ) parser.add_argument( "--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. Override num_train_epochs.", ) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.") parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument( "--eval_all_checkpoints", action="store_true", help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", ) parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") parser.add_argument( "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory" ) parser.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets" ) parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit", ) parser.add_argument( "--fp16_opt_level", type=str, default="O1", help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html", ) parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.") parser.add_argument("--server_port", type=str, default="", help="For distant debugging.") # mean teacher parser.add_argument('--mt', type = int, default = 0, help = 'mean teacher.') parser.add_argument('--mt_updatefreq', type=int, default=1, help = 'mean teacher update frequency') parser.add_argument('--mt_class', type=str, default="kl", help = 'mean teacher class, choices:[smart, prob, logit, kl(default), distill].') parser.add_argument('--mt_lambda', type=float, default=1, help= "trade off parameter of the consistent loss.") parser.add_argument('--mt_rampup', type=int, default=300, help="rampup iteration.") parser.add_argument('--mt_alpha1', default=0.99, type=float, help="moving average parameter of mean teacher (for the exponential moving average).") parser.add_argument('--mt_alpha2', default=0.995, type=float, help="moving average parameter of mean teacher (for the exponential moving average).") parser.add_argument('--mt_beta', default=10, type=float, help="coefficient of mt_loss term.") parser.add_argument('--mt_avg', default="exponential", type=str, help="moving average method, choices:[exponentail(default), simple, double_ema].") parser.add_argument('--mt_loss_type', default="logits", type=str, help="subject to measure model difference, choices:[embeds, logits(default)].") # virtual adversarial training parser.add_argument('--vat', type = int, default = 0, help = 'virtual adversarial training.') parser.add_argument('--vat_eps', type = float, default = 1e-3, help = 'perturbation size for virtual adversarial training.') parser.add_argument('--vat_lambda', type = float, default = 1, help = 'trade off parameter for virtual adversarial training.') parser.add_argument('--vat_beta', type = float, default = 1, help = 'coefficient of the virtual adversarial training loss term.') parser.add_argument('--vat_loss_type', default="logits", type=str, help="subject to measure model difference, choices = [embeds, logits(default)].") # Use data from weak.json parser.add_argument('--load_weak', action="store_true", help = 'Load data from weak.json.') parser.add_argument('--remove_labels_from_weak', action="store_true", help = 'Use data from weak.json, and remove their labels for semi-supervised learning') parser.add_argument('--rep_train_against_weak', type = int, default = 1, help = 'Upsampling training data again weak data. Default: 1') args = parser.parse_args() if ( os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir ): raise ValueError( "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( args.output_dir ) ) # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend="nccl") args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, ) logging_fh = logging.FileHandler(os.path.join(args.output_dir, 'log.txt')) logging_fh.setLevel(logging.DEBUG) logger.addHandler(logging_fh) logger.warning( "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16, ) # Set seed set_seed(args) labels = get_labels(args.data_dir) num_labels = len(labels) # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later pad_token_label_id = CrossEntropyLoss().ignore_index # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained( args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, cache_dir=args.cache_dir if args.cache_dir else None, ) tokenizer = tokenizer_class.from_pretrained( args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None, ) model = model_class.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None, ) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train") # import ipdb; ipdb.set_trace() if args.load_weak: weak_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="weak", remove_labels=args.remove_labels_from_weak) train_dataset = torch.utils.data.ConcatDataset([train_dataset]*args.rep_train_against_weak + [weak_dataset,]) global_step, tr_loss, best_dev, best_test = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id) logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) # Saving last-practice: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("Saving model checkpoint to %s", args.output_dir) model_to_save = ( model.module if hasattr(model, "module") else model ) # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) torch.save(args, os.path.join(args.output_dir, "training_args.bin")) torch.save(model.state_dict(), os.path.join(args.output_dir, "model.pt")) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list( os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)) ) logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) if not best_dev: best_dev = [0, 0, 0] for checkpoint in checkpoints: global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" model = model_class.from_pretrained(checkpoint) model.to(args.device) result, _, best_dev, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, best=best_dev, mode="dev", prefix=global_step) if global_step: result = {"{}_{}".format(global_step, k): v for k, v in result.items()} results.update(result) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: for key in sorted(results.keys()): writer.write("{} = {}\n".format(key, str(results[key]))) if args.do_predict and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) model = model_class.from_pretrained(args.output_dir) model.to(args.device) if not best_test: best_test = [0, 0, 0] result, predictions, _, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, best=best_test, mode="test") # Save results output_test_results_file = os.path.join(args.output_dir, "test_results.txt") with open(output_test_results_file, "w") as writer: for key in sorted(result.keys()): writer.write("{} = {}\n".format(key, str(result[key]))) # Save predictions output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt") with open(output_test_predictions_file, "w") as writer: with open(os.path.join(args.data_dir, "test.json"), "r") as f: example_id = 0 data = json.load(f) for item in data: output_line = str(item["str_words"]) + " " + predictions[example_id].pop(0) + "\n" writer.write(output_line) example_id += 1 return results
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, output_attentions=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: from transformers import RobertaTokenizer, RobertaForTokenClassification import torch tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForTokenClassification.from_pretrained('roberta-base') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] """ outputs = self.roberta( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, ) sequence_output = outputs[0] sequence_output = self.dropout(sequence_output) logits = self.classifier(sequence_output) outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here if labels is not None: loss_fct = CrossEntropyLoss() # Only keep active parts of the loss if attention_mask is not None: active_loss = attention_mask.view(-1) == 1 active_logits = logits.view(-1, self.num_labels) active_labels = torch.where( active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) ) loss = loss_fct(active_logits, active_labels) else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss,) + outputs return outputs # (loss), scores, (hidden_states), (attentions)
def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None, position_ids=None, head_mask=None, input_ids_a=None, token_type_ids_a=None, attention_mask_a=None, input_ids_b=None, token_type_ids_b=None, attention_mask_b=None): # outputs_original = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids, # attention_mask=attention_mask, head_mask=head_mask) # pooled_output = self.dropout(pooled_output) outputs_a = self.bert(input_ids_a, position_ids=None, token_type_ids=token_type_ids_a, attention_mask=attention_mask_a) outputs_b = self.bert(input_ids_b, position_ids=None, token_type_ids=token_type_ids_b, attention_mask=attention_mask_b) if self.later_model_type == 'linear': pooled_output_a = outputs_a[1] pooled_output_a = self.dropout(pooled_output_a) pooled_output_b = outputs_b[1] pooled_output_b = self.dropout(pooled_output_b) pooled_output = torch.cat((pooled_output_a, pooled_output_b, pooled_output_a - pooled_output_b), dim=1) pooled_output = self.projection(pooled_output) pooled_output = self.projection_activation(pooled_output) pooled_output = self.projection_dropout(pooled_output) elif self.later_model_type == '1bert_layer': encoder_outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask) bert_1stlayer = encoder_outputs[1] # pooled_output = self.bert_pooler(bert_1stlayer) pooled_output = self.dropout(bert_1stlayer) elif self.later_model_type == 'bilinear': question_hiddens = outputs_a[0] question_hiddens = self.dropout(question_hiddens) doc_hiddens = outputs_b[0] doc_hiddens = self.dropout(doc_hiddens) question_mask = (1 - attention_mask_a).to(torch.bool) doc_mask = (1 - attention_mask_b).to(torch.bool) x2_weighted_emb = self.qemb_match(doc_hiddens, question_hiddens, question_mask) doc_hiddens = torch.cat((doc_hiddens, x2_weighted_emb), 2) doc_hiddens = self.doc_rnn(doc_hiddens, doc_mask) question_hidden = outputs_a[1] question_hidden = self.qs_proj(question_hidden) question_hidden = self.bilinear_dropout(question_hidden) doc_hiddens = self.doc_proj(doc_hiddens) doc_hiddens = self.bilinear_dropout(doc_hiddens) question_hidden = question_hidden.unsqueeze(1).expand_as( doc_hiddens).contiguous() doc_hiddens = self.bilinear(doc_hiddens, question_hidden) pooled_output = doc_hiddens.max(dim=1)[0] elif self.later_model_type == 'transformer': input_ids = torch.cat((input_ids_a, input_ids_b), dim=1) bert_embeddings_a = outputs_a[0] bert_embeddings_b = outputs_b[0] embeddings_cat = torch.cat((bert_embeddings_a, bert_embeddings_b), dim=1) attention_mask = torch.cat((attention_mask_a, attention_mask_b), dim=1) extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) extended_attention_mask = extended_attention_mask.to( dtype=next(self.parameters()).dtype) extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 token_type_ids_a = torch.zeros_like(token_type_ids_a) token_type_ids_b = torch.ones_like(token_type_ids_b) token_type_ids = torch.cat((token_type_ids_a, token_type_ids_b), dim=1) token_type_ids_emb = self.bert_type_id_emb(token_type_ids) seq_length = embeddings_cat.size(1) if position_ids is None: position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids_a.device) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) embeddings_cat_position_emb = self.bert_position_emb(position_ids) transformer_input = embeddings_cat + embeddings_cat_position_emb + token_type_ids_emb transformer_outputs = self.bert_layer(transformer_input, extended_attention_mask) pooled_output = self.bert_pooler_qd( transformer_outputs[0], question_size=input_ids_a.size(1)) logits = self.classifier(pooled_output) outputs = (logits, ) + outputs_a[2:] + outputs_b[ 2:] # add hidden states and attention if they are here if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss, ) + outputs return outputs # (loss), logits, (hidden_states), (attentions)
def compute_loss(self, task_logits: torch.FloatTensor, labels: torch.LongTensor, task_ind, task_num_labels, batch_size, seq_len, state: dict, ) -> None: """ Computes the loss for a single batch and a single task. Args: task_logits: labels: task_ind: task_num_labels: batch_size: seq_len: state: """ if task_num_labels == 1: # We are doing regression loss_fct = MSELoss() task_loss = loss_fct(task_logits.view(-1), labels.view(-1)) else: if not self.class_weights[task_ind] is None: class_weights = torch.FloatTensor(self.class_weights[task_ind]).to(self.device) else: class_weights = None loss_fct = CrossEntropyLoss(weight=class_weights) if self.relations[task_ind]: task_labels = labels[:, 0, state['task_label_ind']:state['task_label_ind'] + seq_len, :] state['task_label_ind'] += seq_len task_loss = loss_fct(task_logits.permute(0, 3, 1, 2), task_labels.type(torch.LongTensor).to(labels.device)) elif self.tagger[task_ind]: # in cases where we are only given a single task the HF code will have one fewer dimension in the labels, so just add a dummy dimension to make our indexing work: if labels.ndim == 3: labels = labels.unsqueeze(1) task_labels = labels[:, 0, state['task_label_ind'], :] state['task_label_ind'] += 1 task_loss = loss_fct(task_logits.view(-1, task_num_labels), task_labels.reshape([batch_size * seq_len, ]).type(torch.LongTensor).to( labels.device)) else: if labels.ndim == 2: task_labels = labels[:, 0] elif labels.ndim == 3: task_labels = labels[:, 0, task_ind] else: raise NotImplementedError( 'Have not implemented the case where a classification task ' 'is part of an MTL setup with relations and sequence tagging') state['task_label_ind'] += 1 task_loss = loss_fct(task_logits, task_labels.type(torch.LongTensor).to(labels.device)) if state['loss'] is None: state['loss'] = task_loss else: task_weight = 1.0 if task_ind + 1 < len(self.num_labels) else self.final_task_weight state['loss'] += (task_weight * task_loss)
history['val_loss'].append(val_loss) history['val_acc'].append(val_acc) history['best_model'] = model if history['max_val_acc'] < val_acc else history['best_model'] history['max_val_acc'] = max(history['max_val_acc'], val_acc) print('epoch:' + str(epoch)) print('train_loss: {:.6f}'.format(loss)) print('train_acc: {:.6f}'.format(acc)) print('valid_loss: {:.6f}'.format(val_loss)) print('valid_acc: {:.6f}'.format(val_acc)) test_flag = False opt = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) # opt = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9) loss_func = CrossEntropyLoss() # train_ds = TensorDataset(X_train, Y_train) # valid_ds = TensorDataset(X_test, Y_test) # train_ds = MyDataset(X_train, Y_train) # valid_ds = MyDataset(X_test, Y_test) # train_dl, valid_dl = get_data(train_ds, valid_ds, batch_size) # train_dl = WrappedDataLoader(train_dl, preprocess) # valid_dl = WrappedDataLoader(valid_dl, preprocess) # train_dl = WrappedDataLoader( train_loader, preprocess) # valid_dl = WrappedDataLoader(test_loader, preprocess) # fit(epochs, model, loss_func, opt, train_dl, valid_dl) fit(epochs, model, loss_func, opt, X_train.to(device), Y_train.to(device), X_test.to(device), Y_test.to(device)) #draw history
def forward( self, input_ids=None, bbox=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, encoder_hidden_states=None, encoder_attention_mask=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: Examples:: >>> from transformers import LayoutLMTokenizer, LayoutLMForMaskedLM >>> import torch >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased') >>> model = LayoutLMForMaskedLM.from_pretrained('microsoft/layoutlm-base-uncased') >>> words = ["Hello", "[MASK]"] >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782] >>> token_boxes = [] >>> for word, box in zip(words, normalized_word_boxes): ... word_tokens = tokenizer.tokenize(word) ... token_boxes.extend([box] * len(word_tokens)) >>> # add bounding boxes of cls + sep tokens >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]] >>> encoding = tokenizer(' '.join(words), return_tensors="pt") >>> input_ids = encoding["input_ids"] >>> attention_mask = encoding["attention_mask"] >>> token_type_ids = encoding["token_type_ids"] >>> bbox = torch.tensor([token_boxes]) >>> labels = tokenizer("Hello world", return_tensors="pt")["input_ids"] >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids, ... labels=labels) >>> loss = outputs.loss """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.layoutlm( input_ids, bbox, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] prediction_scores = self.cls(sequence_output) masked_lm_loss = None if labels is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct( prediction_scores.view(-1, self.config.vocab_size), labels.view(-1), ) if not return_dict: output = (prediction_scores,) + outputs[2:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, )
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, position_ids=None, head_mask=None, h_ids=None, h_attention_mask=None, p_ids=None, p_attention_mask=None, have_overlap=None, overlap_rate=None, subsequence=None, constituent=None, binary_labels=None): if self.hypothesis_only: outputs = self.bert(h_ids, token_type_ids=None, attention_mask=h_attention_mask) pooled_h = outputs[1] pooled_h_g = self.dropout(pooled_h) logits = self.h_classifier1(pooled_h_g) outputs = (logits, ) + outputs[2:] elif not self.hans_only: outputs = self.bert(input_ids, position_ids=position_ids, \ token_type_ids=token_type_ids, \ attention_mask=attention_mask, head_mask=head_mask) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) # add hidden states and attention if they are here outputs = (logits, ) + outputs[2:] if self.hans: # if both are correct. h_outputs = self.bert(h_ids, token_type_ids=None, attention_mask=h_attention_mask) if self.ensemble_training: # also computes the h-only results. pooled_h_second = h_outputs[1] h_embd_second = grad_mul_const(pooled_h_second, 0.0) pooled_h_g_second = self.dropout(h_embd_second) h_logits_second = self.h_classifier1_second(pooled_h_g_second) h_outputs_second = (h_logits_second, ) + h_outputs[2:] h_matrix = h_outputs[0] h_matrix = grad_mul_const(h_matrix, 0.0) h_matrix = self.dropout(h_matrix) p_outputs = self.bert(p_ids, token_type_ids=None, attention_mask=p_attention_mask) p_matrix = p_outputs[0] p_matrix = grad_mul_const(p_matrix, 0.0) p_matrix = self.dropout(p_matrix) # compute similarity features. if self.hans_features: simialrity_score = get_word_similarity_new(h_matrix, p_matrix, self.similarity, \ h_attention_mask, p_attention_mask) # this is the default case. hans_h_inputs = torch.cat((simialrity_score, \ have_overlap.view(-1, 1), overlap_rate.view(-1, 1), subsequence.view(-1, 1), constituent.view(-1, 1)), 1) if self.hans_features and len(self.length_features) != 0: length_features = get_length_features(p_attention_mask, h_attention_mask, self.length_features) hans_h_inputs = torch.cat((hans_h_inputs, length_features), 1) h_logits = self.h_classifier1(hans_h_inputs) h_outputs = (h_logits, ) + h_outputs[2:] if self.hans_only: logits = h_logits # overwrite outputs. outputs = h_outputs elif self.focal_loss or self.poe_loss or self.rubi: h_outputs = self.bert(h_ids, token_type_ids=None, attention_mask=h_attention_mask) pooled_h = h_outputs[1] h_embd = grad_mul_const(pooled_h, 0.0) pooled_h_g = self.dropout(h_embd) h_logits = self.h_classifier1(pooled_h_g) h_outputs = (h_logits, ) + h_outputs[2:] if labels is not None: if self.num_labels == 1: # We are doing regression loss_fct = MSELoss() loss = loss_fct(logits.view(-1), labels.view(-1)) else: if self.focal_loss: loss_fct = FocalLoss(gamma=self.gamma_focal, \ ensemble_training=self.ensemble_training, aggregate_ensemble=self.aggregate_ensemble) elif self.poe_loss: loss_fct = POELoss( ensemble_training=self.ensemble_training, poe_alpha=self.poe_alpha) elif self.rubi: loss_fct = RUBILoss(num_labels=self.num_labels) elif self.hans_only: if self.weighted_bias_only and self.hans: weights = torch.tensor([0.5, 1.0, 0.5]).cuda() loss_fct = CrossEntropyLoss(weight=weights) else: loss_fct = CrossEntropyLoss() if self.rubi or self.focal_loss or self.poe_loss: if self.ensemble_training: model_loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1), \ h_logits.view(-1, self.num_labels), h_logits_second.view(-1, self.num_labels)) else: model_loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1), \ h_logits.view(-1, self.num_labels)) if self.weighted_bias_only and self.hans: weights = torch.tensor([0.5, 1.0, 0.5]).cuda() h_loss_fct = CrossEntropyLoss(weight=weights) if self.ensemble_training: h_loss_fct_second = CrossEntropyLoss() else: h_loss_fct = CrossEntropyLoss() h_loss = h_loss_fct(h_logits.view(-1, self.num_labels), labels.view(-1)) if self.ensemble_training: h_loss += h_loss_fct_second( h_logits_second.view(-1, self.num_labels), labels.view(-1)) loss = model_loss + self.lambda_h * h_loss else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) outputs = (loss, ) + outputs all_outputs = {} all_outputs["bert"] = outputs if self.rubi or self.focal_loss or self.poe_loss: all_outputs["h"] = h_outputs if self.ensemble_training: all_outputs["h_second"] = h_outputs_second return all_outputs # (loss), logits, (hidden_states), (attentions)
def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, ): r""" masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Example:: from transformers import AlbertTokenizer, AlbertForMaskedLM import torch tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertForMaskedLM.from_pretrained('albert-base-v2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, ) sequence_outputs = outputs[0] prediction_scores = self.predictions(sequence_outputs) outputs = (prediction_scores, ) + outputs[ 2:] # Add hidden states and attention if they are here if masked_lm_labels is not None: loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct( prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss, ) + outputs return outputs
def forward( self, input_ids=None, past_key_values=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, past_key_values=past_key_values, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) loss = None if labels is not None: # Shift so that tokens < n predict n shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) if not return_dict: output = (lm_logits, ) + transformer_outputs[1:] return ((loss, ) + output) if loss is not None else output return CausalLMOutputWithPast( loss=loss, logits=lm_logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions, )