def __init__(
    self,
    model: PreTrainedModel,
    args: TrainingArguments,
    data_collator: Optional[DataCollator] = None,
    train_dataset: Optional[Dataset] = None,
    eval_dataset: Optional[Dataset] = None,
    compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
    prediction_loss_only=False,
    tb_writer: Optional["SummaryWriter"] = None,
    optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
):
    """
    Trainer is a simple but feature-complete training and eval loop for PyTorch,
    optimized for Transformers.

    Args:
        prediction_loss_only:
            (Optional) in evaluation and prediction, only return the loss
    """
    self.model = model.to(args.device)
    self.args = args
    self.data_collator = data_collator if data_collator is not None else default_data_collator
    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.compute_metrics = compute_metrics
    self.prediction_loss_only = prediction_loss_only
    self.optimizers = optimizers

    if tb_writer is not None:
        self.tb_writer = tb_writer
    elif is_tensorboard_available() and self.is_world_master():
        self.tb_writer = SummaryWriter(log_dir=self.args.logging_dir)
    if not is_tensorboard_available():
        logger.warning(
            "You are instantiating a Trainer but Tensorboard is not installed. You should consider installing it."
        )

    if is_wandb_available():
        self._setup_wandb()
    else:
        logger.info(
            "You are instantiating a Trainer but W&B is not installed. To use wandb logging, "
            "run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface."
        )

    set_seed(self.args.seed)

    # Create output directory if needed
    if self.is_world_master():
        os.makedirs(self.args.output_dir, exist_ok=True)

    if is_torch_tpu_available():
        # Set an xla_device flag on the model's config.
        # We'll find a more elegant way and not need to do this in the future.
        self.model.config.xla_device = True

    if not callable(self.data_collator) and callable(getattr(self.data_collator, "collate_batch", None)):
        self.data_collator = self.data_collator.collate_batch
        warnings.warn(
            (
                "The `data_collator` should now be a simple callable (function, class with `__call__`), classes "
                "with a `collate_batch` are deprecated and won't be supported in a future version."
            ),
            FutureWarning,
        )
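# Hedged usage sketch for the constructor above (not part of the original code). The model,
# TrainingArguments, and datasets are placeholders; `train_dataset`/`eval_dataset` stand for
# any torch Dataset yielding the model's inputs, and the default data collator is used.
from transformers import AutoModelForSequenceClassification, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
args = TrainingArguments(output_dir="outputs", logging_dir="logs")
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=None,  # or a Callable[[EvalPrediction], Dict]
)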
def from_huggingface_model(cls, model: PreTrainedModel, ffn_activation: str, ffn_dropout: float, attention: Attention):
    config = model.config
    encoder = cls(
        n_layers=config.n_layers,
        n_heads=config.n_heads,
        dim=config.dim,
        hidden_dim=config.hidden_dim,
        ffn_activation=ffn_activation,
        ffn_dropout=ffn_dropout,
        attention=attention,
    )
    # After creating the encoder, we copy weights over from the transformer. This currently
    # requires that the internal structure of the text side of this encoder *exactly matches*
    # the internal structure of whatever transformer you're using.
    encoder_parameters = dict(encoder.named_parameters())
    for name, parameter in model.named_parameters():
        if name.startswith("transformer."):
            name = name.replace("LayerNorm", "layer_norm")
            if name not in encoder_parameters:
                raise ValueError(
                    f"Couldn't find a matching parameter for {name}. Is this transformer "
                    "compatible with the joint encoder you're using?"
                )
            encoder_parameters[name].data.copy_(parameter.data)
    return encoder
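# Hedged usage sketch (not from the original code): `JointEncoder` and `my_attention` are
# hypothetical stand-ins for the class that defines `from_huggingface_model` and for its
# Attention module. DistilBERT is used here because its config exposes the
# n_layers/n_heads/dim/hidden_dim attributes read above.
from transformers import AutoModel

hf_model = AutoModel.from_pretrained("distilbert-base-uncased")
encoder = JointEncoder.from_huggingface_model(
    hf_model,
    ffn_activation="gelu",
    ffn_dropout=0.1,
    attention=my_attention,  # hypothetical Attention instance
)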
def __init__(
    self,
    model: PreTrainedModel,
    args: TrainingArguments,
    data_collator: Optional[DataCollator] = None,
    train_dataset: Optional[Dataset] = None,
    eval_dataset: Optional[Dataset] = None,
    compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
    prediction_loss_only=False,
    optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = None,
):
    self.model = model.to(args.device)
    # extra linear head mapping the 768-dim hidden states to 1839 outputs
    self.fc = nn.Linear(768, 1839).to(args.device)
    self.args = args
    self.data_collator = data_collator if data_collator is not None else default_data_collator
    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.compute_metrics = compute_metrics
    self.prediction_loss_only = prediction_loss_only
    self.optimizers = optimizers

    set_seed(self.args.seed)
def convert_weights_and_push(save_directory: Path, model_name: str = None, push_to_hub: bool = True):
    filename = "imagenet-1k-id2label.json"
    num_labels = 1000
    repo_id = "datasets/huggingface/label-files"

    id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r"))
    id2label = {int(k): v for k, v in id2label.items()}
    label2id = {v: k for k, v in id2label.items()}

    ImageNetPreTrainedConfig = partial(RegNetConfig, num_labels=num_labels, id2label=id2label, label2id=label2id)

    names_to_config = {
        "regnet-y-10b-seer": ImageNetPreTrainedConfig(
            depths=[2, 7, 17, 1], hidden_sizes=[2020, 4040, 11110, 28280], groups_width=1010
        ),
        # finetuned on imagenet
        "regnet-y-10b-seer-in1k": ImageNetPreTrainedConfig(
            depths=[2, 7, 17, 1], hidden_sizes=[2020, 4040, 11110, 28280], groups_width=1010
        ),
    }

    # add seer weights logic
    def load_using_classy_vision(checkpoint_url: str) -> Tuple[Dict, Dict]:
        files = torch.hub.load_state_dict_from_url(checkpoint_url, model_dir=str(save_directory), map_location="cpu")
        # check if we have a head, if yes add it
        model_state_dict = files["classy_state_dict"]["base_model"]["model"]
        return model_state_dict["trunk"], model_state_dict["heads"]

    names_to_from_model = {
        "regnet-y-10b-seer": partial(
            load_using_classy_vision,
            "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_regnet10B/model_iteration124500_conso.torch",
        ),
        "regnet-y-10b-seer-in1k": partial(
            load_using_classy_vision,
            "https://dl.fbaipublicfiles.com/vissl/model_zoo/seer_finetuned/seer_10b_finetuned_in1k_model_phase28_conso.torch",
        ),
    }

    from_to_ours_keys = get_from_to_our_keys(model_name)

    if not (save_directory / f"{model_name}.pth").exists():
        logger.info("Loading original state_dict.")
        from_state_dict_trunk, from_state_dict_head = names_to_from_model[model_name]()
        from_state_dict = from_state_dict_trunk
        if "in1k" in model_name:
            # add the head
            from_state_dict = {**from_state_dict_trunk, **from_state_dict_head}
        logger.info("Done!")

        converted_state_dict = {}

        not_used_keys = list(from_state_dict.keys())
        regex = r"\.block.-part."
        # the original checkpoints have `block[0,1]-part` in each key name; we remove it
        for key in from_state_dict.keys():
            # remove the weird "block[0,1]-part" from the key
            src_key = re.sub(regex, "", key)
            # now src_key is the key we got from the original model after tracing,
            # so use it to look up the correct destination key
            dest_key = from_to_ours_keys[src_key]
            # store the parameter with our key
            converted_state_dict[dest_key] = from_state_dict[key]
            not_used_keys.remove(key)
        # check that all keys have been updated
        assert len(not_used_keys) == 0, f"Some keys were not used {','.join(not_used_keys)}"

        logger.info(f"The following keys were not used: {','.join(not_used_keys)}")

        # save our state dict to disk
        torch.save(converted_state_dict, save_directory / f"{model_name}.pth")

        del converted_state_dict
    else:
        logger.info("The state_dict was already stored on disk.")

    if push_to_hub:
        logger.info(f"Token is {os.environ['HF_TOKEN']}")
        logger.info("Loading our model.")
        # create our model
        our_config = names_to_config[model_name]
        our_model_func = RegNetModel
        if "in1k" in model_name:
            our_model_func = RegNetForImageClassification
        our_model = our_model_func(our_config)
        # place our model on the meta device (so remove all the weights)
        our_model.to(torch.device("meta"))
        logger.info("Loading state_dict in our model.")
        # load state dict
        state_dict_keys = our_model.state_dict().keys()
        PreTrainedModel._load_pretrained_model_low_mem(
            our_model, state_dict_keys, [save_directory / f"{model_name}.pth"]
        )
        logger.info("Finally, pushing!")
        # push it to hub
        our_model.push_to_hub(
            repo_path_or_name=save_directory / model_name,
            commit_message="Add model",
            output_dir=save_directory / model_name,
        )

        size = 384
        # we can use the convnext one
        feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/convnext-base-224-22k-1k", size=size)
        feature_extractor.push_to_hub(
            repo_path_or_name=save_directory / model_name,
            commit_message="Add feature extractor",
            output_dir=save_directory / model_name,
        )
def train(model: PreTrainedModel, train_dataloader: DataLoader, dev_dataloader: DataLoader,
          batch_size: int, gradient_accumulation_steps: int, device,
          num_train_epochs: int = 20, warmup_proportion: float = 0.1,
          learning_rate: float = 1e-5, patience: int = 5,
          output_dir: str = "/tmp/", model_file_name: str = "model.bin") -> str:
    """
    Trains a BERT model on a set of training data, tuning it on a set of development data.

    Args:
        model: the model that will be trained
        train_dataloader: a DataLoader with training data
        dev_dataloader: a DataLoader with development data (for early stopping)
        batch_size: the batch size for training
        gradient_accumulation_steps: the number of steps over which gradients are accumulated
        device: the device where training will take place ("cpu" or "cuda")
        num_train_epochs: the maximum number of training epochs
        warmup_proportion: the proportion of training steps for which the learning rate is warmed up
        learning_rate: the initial learning rate
        patience: the number of epochs after which training stops if no improvement on the dev set is observed
        output_dir: the directory where the model will be saved
        model_file_name: the filename of the model file

    Returns:
        the path to the trained model
    """
    def warmup_linear(x, warmup=0.002):
        if x < warmup:
            return x / warmup
        return 1.0 - x

    output_model_file = os.path.join(output_dir, model_file_name)

    num_train_steps = int(
        len(train_dataloader.dataset) / batch_size / gradient_accumulation_steps * num_train_epochs)

    # do not apply weight decay to biases and LayerNorm parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, correct_bias=False)

    global_step = 0
    loss_history = []
    best_epoch = 0
    for epoch in trange(int(num_train_epochs), desc="Epoch"):
        model.train()
        tr_loss = 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch

            if type(model) == BertForSequenceClassification or type(model) == BertForMultiLabelSequenceClassification:
                outputs = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
            elif type(model) == DistilBertForSequenceClassification:
                outputs = model(input_ids, attention_mask=input_mask, labels=label_ids)

            loss = outputs[0]
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()

            if (step + 1) % gradient_accumulation_steps == 0:
                # linear warmup followed by linear decay of the learning rate
                lr_this_step = learning_rate * warmup_linear(global_step / num_train_steps, warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

        dev_loss, _, _ = evaluate(model, dev_dataloader, device)

        print("Loss history:", loss_history)
        print("Dev loss:", dev_loss)

        # save the model whenever the dev loss improves; stop after `patience` epochs without improvement
        if len(loss_history) == 0 or dev_loss < min(loss_history):
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), output_model_file)
            best_epoch = epoch

        if epoch - best_epoch >= patience:
            print("No improvement on development set. Finish training.")
            break

        loss_history.append(dev_loss)

    return output_model_file
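# Hedged usage sketch (placeholders, not original code): `train_dataloader` and `dev_dataloader`
# must yield (input_ids, input_mask, segment_ids, label_ids) batches as the loop above expects.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to("cuda")
best_model_file = train(
    model,
    train_dataloader,
    dev_dataloader,
    batch_size=16,
    gradient_accumulation_steps=1,
    device="cuda",
    num_train_epochs=10,
    patience=3,
)
# reload the best checkpoint saved by the early-stopping loop
model.load_state_dict(torch.load(best_model_file))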
def evaluate(model: PreTrainedModel, dataloader: DataLoader, device: str) -> Tuple[float, np.ndarray, np.ndarray]:
    """
    Evaluates a BERT model on a labelled data set.

    Args:
        model: the BertModel to be evaluated
        dataloader: the DataLoader with the test data
        device: the device where evaluation will take place ("cpu" or "cuda")

    Returns:
        a tuple with (the evaluation loss, an array with the correct labels, and an array with the predicted labels)
    """
    model.eval()

    eval_loss = 0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for step, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch

        with torch.no_grad():
            if type(model) == BertForSequenceClassification or type(model) == BertForMultiLabelSequenceClassification:
                tmp_eval_loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
            elif type(model) == DistilBertForSequenceClassification:
                tmp_eval_loss, logits = model(input_ids, attention_mask=input_mask, labels=label_ids)

        if type(model) == BertForSequenceClassification or type(model) == DistilBertForSequenceClassification:
            # single-label classification: take the class with the highest logit
            outputs = np.argmax(logits.to('cpu'), axis=1)
            label_ids = label_ids.to('cpu').numpy()
            predicted_labels += list(outputs)
        elif type(model) == BertForMultiLabelSequenceClassification:
            # multi-label classification: threshold the per-label sigmoid probabilities at 0.5
            sig = Sigmoid()
            outputs = sig(logits).to('cpu').numpy()
            label_ids = label_ids.to('cpu').numpy()
            predicted_labels += list(outputs >= 0.5)

        correct_labels += list(label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)

    return eval_loss, correct_labels, predicted_labels
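# Hedged usage sketch (placeholders, not original code): `test_dataloader` is a DataLoader
# yielding the same (input_ids, input_mask, segment_ids, label_ids) batches as above.
eval_loss, correct_labels, predicted_labels = evaluate(model, test_dataloader, device="cuda")
print(f"Eval loss: {eval_loss:.4f}")
# for the single-label models this is plain accuracy; for the multi-label case it is
# per-element accuracy over the label matrix
print("Accuracy:", (correct_labels == predicted_labels).mean())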