def get_optimizers(
    self, num_training_steps: int
) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
    # Prepare optimizer and schedule (linear warmup and decay).
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": self.args.weight_decay,
        },
        {
            "params": [p for n, p in self.model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.args.learning_rate,
                      eps=self.args.adam_epsilon)
    # Warm up either over an explicit step count or over a proportion of
    # training.
    if self.args.warmup_steps > 0:
        logger.info("***** Linear warmup over %d warmup_steps *****",
                    self.args.warmup_steps)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.args.warmup_steps,
            num_training_steps=num_training_steps)
    else:
        logger.info("***** Linear warmup over %.1f%% of training *****",
                    self.args.warmup_proportion * 100)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(self.args.warmup_proportion * num_training_steps),
            num_training_steps=num_training_steps)
    return optimizer, scheduler
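# --- Hedged illustration (not from the original code) of how the warmup
# choice in get_optimizers above behaves: a proportion is just a step count
# derived from num_training_steps. The tiny linear model and the numbers are
# illustrative assumptions.
import torch
from transformers import get_linear_schedule_with_warmup

demo_model = torch.nn.Linear(4, 2)
demo_opt = torch.optim.AdamW(demo_model.parameters(), lr=1e-3)
demo_total = 100
# warmup_proportion=0.1 over 100 steps is equivalent to warmup_steps=10.
demo_sched = get_linear_schedule_with_warmup(
    demo_opt, num_warmup_steps=int(0.1 * demo_total),
    num_training_steps=demo_total)
for demo_step in range(3):
    demo_opt.step()
    demo_sched.step()
    print(demo_step, demo_sched.get_last_lr())  # LR ramps linearly toward 1e-3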
def init_schedule(config, optimizer, train_loader):
    t_total = len(train_loader) * config.epochs
    warmup_steps = int(t_total * config.warmup_ratio)
    # `switch` was undefined in the original; it presumably gates apex FP16
    # training, so a config flag is assumed here.
    if config.fp16:
        # FP16_Optimizer wraps the base optimizer, so the scheduler must be
        # attached to the inner `.optimizer` attribute.
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        scheduler = get_linear_schedule_with_warmup(
            optimizer.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    else:
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=t_total)
    return scheduler
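# --- Hedged usage sketch for init_schedule; the SimpleNamespace config and
# toy DataLoader are illustrative assumptions, not part of the original
# project.
from types import SimpleNamespace
import torch
from torch.utils.data import DataLoader, TensorDataset

demo_config = SimpleNamespace(epochs=3, warmup_ratio=0.1, fp16=False)
demo_loader = DataLoader(TensorDataset(torch.zeros(8, 4)), batch_size=2)
demo_model = torch.nn.Linear(4, 2)
demo_opt = torch.optim.AdamW(demo_model.parameters(), lr=5e-5)
demo_sched = init_schedule(demo_config, demo_opt, demo_loader)
# 4 batches/epoch * 3 epochs = 12 total steps, int(12 * 0.1) = 1 warmup step.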
def start_train(args):
    '''data loaders'''
    train_token_ids, train_att_masks, train_labels = load_tokens(
        args.train_sent_tokens, args.train_att_tokens, args.train_labels)
    train_dataset = TensorDataset(train_token_ids, train_att_masks,
                                  train_labels.reshape(-1, 1))
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size,
                              shuffle=True, num_workers=4)

    test_token_ids, test_att_masks, test_labels = load_tokens(
        args.test_sent_tokens, args.test_att_tokens, args.test_labels)
    test_dataset = TensorDataset(test_token_ids, test_att_masks,
                                 test_labels.reshape(-1, 1))
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size,
                             shuffle=False, num_workers=4)

    '''model setup'''
    model = BertAttractor()
    optimizer = optim.AdamW(model.parameters(), lr=1e-5)
    num_train_steps = int(len(train_dataset) / args.batch_size * args.epochs)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    '''init model'''
    if args.init:
        print(f'Initializing model from {args.init}')
        checkpoint = torch.load(args.init)
        model.load_state_dict(checkpoint['model_state_dict'])

    '''device allocation'''
    if torch.cuda.is_available():
        print('Using gpu')
        device = torch.device('cuda:1')
    else:
        print('Cannot train without GPU')
        sys.exit()
    model.to(device)  # the original never moved the model onto the device

    train_acc, train_loss = [], []
    test_acc, test_loss = [], []
    start_epoch = 0
    if args.resume:
        print(f'Resuming model from {args.resume}')
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        train_acc, train_loss = checkpoint['train_acc'], checkpoint['train_loss']
        test_acc, test_loss = checkpoint['test_acc'], checkpoint['test_loss']
        # Move any restored optimizer state tensors back onto the device.
        for state in optimizer.state.values():
            for key, value in state.items():
                if isinstance(value, torch.Tensor):
                    state[key] = value.to(device)

    '''freeze bert model'''
    # for param in model.parameters():
    #     param.requires_grad = False
    # for param in model.bert.classifier.parameters():
    #     param.requires_grad = True
    # for param in model.attractor.parameters():
    #     param.requires_grad = True

    '''train the model'''
    train_model(model, train_loader, test_loader, optimizer, device,
                start_epoch, args.epochs, train_acc, train_loss,
                test_acc, test_loss, args.out_dir, scheduler)
def fit(model, epochs, train_loader, valid_loader, lr=1e-3, class_weights=None):
    # `trainSetlength`, `batch_size` and `num_warmup_steps` were undefined in
    # the original; the number of optimizer steps is batches-per-epoch times
    # epochs, and warmup is assumed to be 10% of training (matching the
    # warmup_proportion idea in the original's commented-out BertAdam call).
    num_training_steps = len(train_loader) * epochs
    num_warmup_steps = int(0.1 * num_training_steps)
    # check which loss to use for multilabel
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps)
    for epoch in range(epochs):
        model.train()
        total_loss = num = 0
        for x, y in train_loader:
            optimizer.zero_grad()
            y_scores = model(x)
            loss = criterion(y_scores, y)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            num += len(y)
        print(epoch, total_loss / num, *perf(model, valid_loader))
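# --- Hedged sketch: one common way to build the class_weights tensor that
# fit() forwards to CrossEntropyLoss (inverse-frequency weighting; the label
# counts are illustrative assumptions).
import torch
counts = torch.tensor([900., 100.])                    # per-class sample counts
class_weights = counts.sum() / (len(counts) * counts)  # tensor([0.5556, 5.0])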
def get_optimizers(
    self, num_training_steps: int
) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
    """
    Setup the optimizer and the learning rate scheduler.

    We provide a reasonable default that works well. If you want to use
    something else, you can pass a tuple in the Trainer's init, or override
    this method in a subclass.
    """
    if self.optimizers is not None:
        return self.optimizers
    # Prepare optimizer and schedule (linear warmup and decay).
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.model.named_parameters()
                       if "relational_transformer" not in n
                       and not any(nd in n for nd in no_decay)],
            "weight_decay": self.args.weight_decay,
        },
        {
            "params": [p for n, p in self.model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
        {
            "params": [p for n, p in self.model.named_parameters()
                       if "relational_transformer" in n
                       and not any(nd in n for nd in no_decay)],
            "weight_decay": self.args.weight_decay,
            "lr": 7e-5,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.args.learning_rate,
                      eps=self.args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.args.warmup_steps,
        num_training_steps=num_training_steps)
    return optimizer, scheduler
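# --- Hedged sanity check (the toy module is an assumption) for name-based
# parameter grouping as used above. Caveat worth knowing: matching on
# "LayerNorm.weight" relies on attribute names, so inside an nn.Sequential the
# LayerNorm weight is named "1.weight" and would fall into the decay group.
import torch
m = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.LayerNorm(4))
no_decay = ["bias", "LayerNorm.weight"]
decay_params = [p for n, p in m.named_parameters()
                if not any(nd in n for nd in no_decay)]
nodecay_params = [p for n, p in m.named_parameters()
                  if any(nd in n for nd in no_decay)]
# Every parameter lands in exactly one group.
assert len(decay_params) + len(nodecay_params) == len(list(m.parameters()))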
def get_optimizer(self, num_training_steps: int):
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": self.args.weight_decay,
        },
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.args.learning_rate,
                      eps=self.args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.args.warmup_steps,
        num_training_steps=num_training_steps)
    return optimizer, scheduler
def get_optimizers(
    self, num_training_steps: int
) -> Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]:
    if self.optimizers is not None:
        return self.optimizers
    # Prepare optimizer and schedule (linear warmup and decay).
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": self.args.weight_decay,
        },
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
        {
            "params": self.fc.parameters(),
            "weight_decay": self.args.weight_decay,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.args.learning_rate,
                      eps=self.args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.args.warmup_steps,
        num_training_steps=num_training_steps)
    return optimizer, scheduler
def get_optimizers(self):
    # Setup the optimizer and the learning rate scheduler.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": cfg.weight_decay,
        },
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.lr)
    num_training_steps = self.reader.set_stats['train']['num_dials'] * \
        cfg.epoch_num // (cfg.gradient_accumulation_steps * cfg.batch_size)
    # Default to warming up over the first 20% of training when no explicit
    # warmup step count is configured.
    num_warmup_steps = cfg.warmup_steps if cfg.warmup_steps >= 0 else int(
        num_training_steps * 0.2)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps)
    return optimizer, scheduler
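# --- Hedged numeric example of the step arithmetic above (all values are
# assumptions): 10_000 training dialogs, 10 epochs, gradient accumulation 4,
# batch size 8.
num_training_steps = 10_000 * 10 // (4 * 8)       # 3125 optimizer steps
num_warmup_steps = int(num_training_steps * 0.2)  # 625 when cfg.warmup_steps < 0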
def get_optimizers(model, learning_rate, adam_epsilon, weight_decay,
                   num_training_steps):
    # Prepare optimizer and schedule (linear warmup and decay).
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=learning_rate,
                      eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    return optimizer, scheduler
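# --- Hedged example call for the standalone get_optimizers above; the toy
# model and hyperparameters are illustrative assumptions.
import torch
toy_model = torch.nn.Linear(8, 2)
optimizer, scheduler = get_optimizers(toy_model,
                                      learning_rate=2e-5,
                                      adam_epsilon=1e-8,
                                      weight_decay=0.01,
                                      num_training_steps=1000)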
def create_scheduler(self, num_training_steps: int):
    scheduler = get_linear_schedule_with_warmup(
        self.optimizer,
        num_warmup_steps=self.args.warmup_steps,
        num_training_steps=num_training_steps,
    )
    return scheduler
def build_default_model(args):
    """
    Build a custom model.

    By convention this returns a (model, optimizer, scheduler) triple.
    """
    # -------- model --------
    model = load_pretrained_model(args)
    model.to(args.device)

    # -------- optimizer --------
    from transformers.optimization import AdamW
    optimizer_parameters = get_default_optimizer_parameters(
        model, args.weight_decay)
    optimizer = AdamW(optimizer_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon,
                      correct_bias=False)

    # -------- scheduler --------
    from transformers.optimization import get_linear_schedule_with_warmup
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(args.total_steps * args.warmup_rate),
        num_training_steps=args.total_steps)

    return model, optimizer, scheduler
def configure_optimizers(self):
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in self.named_parameters()
                if not any(nd in n for nd in no_decay) and p.requires_grad
            ],
            "weight_decay": self.args.weight_decay,
        },
        {
            "params": [
                p for n, p in self.named_parameters()
                if any(nd in n for nd in no_decay) and p.requires_grad
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=self.args.lr,
                      eps=self.args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.args.warmup_steps,
        num_training_steps=self.args.train_steps)
    return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
def build_optimizer_scheduler(model, num_train_steps, learning_rate):
    optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=100, num_training_steps=num_train_steps)
    return optimizer, scheduler
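# --- Hedged usage sketch for build_optimizer_scheduler; the toy model and
# step count are assumptions. Note that correct_bias=False implies the AdamW
# from transformers, not torch.optim.
import torch
toy_model = torch.nn.Linear(4, 2)
optimizer, scheduler = build_optimizer_scheduler(toy_model,
                                                 num_train_steps=1000,
                                                 learning_rate=3e-5)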
def train(self, epochs):
    """
    Runs the training.
    """
    pretrained_model = self.config.get("pretrained_mtb_model", None)
    pretrained_model = ("pretrained" if pretrained_model else "no_pretraining")

    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.config.get("lr"))
    # Total optimizer steps: mini-batches are accumulated up to batch_size.
    ovr_steps = int(epochs * len(self.data_loader.train_generator)
                    * self.config.get("mini_batch_size")
                    / self.config.get("batch_size"))
    # Warm up over the first 10% of training.
    scheduler = get_linear_schedule_with_warmup(optimizer, ovr_steps // 10,
                                                ovr_steps)

    results_path = os.path.join("results", "sem_eval", pretrained_model,
                                str(epochs))
    best_model_path = os.path.join(self.checkpoint_dir, "best_model.pth.tar")
    resume = self.config.get("resume", False)
    if resume and os.path.exists(best_model_path):
        (
            self._start_epoch,
            self._best_test_f1,
            self._train_loss,
            self._train_acc,
            self._test_f1,
            self._test_acc,  # the original unpacked into self._train_acc twice
        ) = self.load_best_model(self.checkpoint_dir)

    logger.info("Starting training process")
    pad_id = self.tokenizer.pad_token_id

    for epoch in range(self._start_epoch, epochs):
        self._train_epoch(epoch, pad_id, optimizer, scheduler)
        data = self._write_kpis(results_path)
        self._plot_results(data, results_path)

    logger.info("Finished Training.")
    return self.model
def create_optimizer_and_scheduler(self, num_training_steps: int):
    """
    Based on Transformers' default one, we add a layer-fixing option where the
    bottom n layers' parameters are frozen and only the top layers are further
    fine-tuned.
    """
    if self.optimizer is None:
        params = {}
        for n, p in self.model.named_parameters():
            if self.args.fix_layers > 0:
                if 'encoder.layer' in n:
                    try:
                        # Skip past "encoder.layer." to read the layer index.
                        layer_num = int(
                            n[n.find('encoder.layer') + 14:].split('.')[0])
                    except ValueError:
                        print(n)
                        raise Exception(
                            "Could not parse layer number from %s" % n)
                    if layer_num >= self.args.fix_layers:
                        print('yes', n)
                        params[n] = p
                    else:
                        print('no ', n)
                elif 'embeddings' in n:
                    print('no ', n)
                else:
                    print('yes', n)
                    params[n] = p
            else:
                params[n] = p
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in params.items()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": self.args.weight_decay,
            },
            {
                "params": [
                    p for n, p in params.items()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        self.optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.args.learning_rate,
            betas=(self.args.adam_beta1, self.args.adam_beta2),
            eps=self.args.adam_epsilon,
        )
    if self.lr_scheduler is None:
        self.lr_scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.args.warmup_steps,
            num_training_steps=num_training_steps)
def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=1e-5)
    dataset_size = len(self.train_dataloader.dataloader.dataset)
    # Optimizer steps per run: one step per accumulated batch.
    num_steps = int(dataset_size * self.args.epochs
                    / self.args.grad_accum / self.args.batch_size)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(num_steps * 0.1),
        num_training_steps=num_steps)
    return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
def run_train():
    data_dir = config.DATA_DIR
    nerProcessor = NerProcessor()
    train_example = nerProcessor.get_train_examples(data_dir)
    label_list = nerProcessor.get_labels()
    tokenizer = transformers.BertTokenizer.from_pretrained(
        config.BERT_TOKENIZER_PATH)
    train_features = convert_examples_to_features(train_example, label_list,
                                                  config.MAX_SEQ_LEN, tokenizer)

    input_ids = torch.tensor([f["input_ids"] for f in train_features],
                             dtype=torch.long)
    attention_mask = torch.tensor(
        [f["attention_mask"] for f in train_features], dtype=torch.long)
    token_type_ids = torch.tensor(
        [f["token_type_ids"] for f in train_features], dtype=torch.long)
    label_ids = torch.tensor([f["label_ids"] for f in train_features])
    # One-hot encode the labels and cast to float for the loss.
    label_ids = F.one_hot(label_ids).float()

    train_dataset = TensorDataset(input_ids, attention_mask, token_type_ids,
                                  label_ids)
    sampler = SequentialSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=sampler,
                                  batch_size=config.TRAIN_BATCH_SIZE)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertNER(config.BERT_MODEL_PATH, len(label_list) + 1)
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE)
    num_training_step = len(
        train_dataset) // config.TRAIN_BATCH_SIZE * config.TRAIN_EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_step)
    for epoch in range(config.TRAIN_EPOCHS):
        train_fn(model, device, train_dataloader, optimizer, scheduler)
        model_to_save = model.module if hasattr(model, "module") else model
        model_save_path = os.path.join(f"{config.BERT_OUTPUT}/{epoch+1}",
                                       WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), model_save_path)
        tokenizer.save_vocabulary(f"{config.BERT_OUTPUT}/{epoch+1}/vocab.txt")
    model_to_save = model.module if hasattr(model, "module") else model
    model_save_path = os.path.join(config.BERT_OUTPUT, WEIGHTS_NAME)
    torch.save(model_to_save.state_dict(), model_save_path)
    tokenizer.save_vocabulary(f"{config.BERT_OUTPUT}/vocab.txt")
def finetune(features, optimizer, num_epoch, num_steps):
    best_score = -1
    train_dataloader = DataLoader(features,
                                  batch_size=args.train_batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn,
                                  drop_last=True)
    train_iterator = range(int(num_epoch))
    total_steps = int(len(train_dataloader) * num_epoch)
    warmup_steps = int(total_steps * args.warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)
    print("Total steps: {}".format(total_steps))
    print("Warmup steps: {}".format(warmup_steps))
    for epoch in train_iterator:
        for step, batch in enumerate(train_dataloader):
            num_steps += 1
            model.train()
            inputs = {
                'input_ids': batch[0].to(args.device),
                'attention_mask': batch[1].to(args.device),
                'labels': batch[2],
                'entity_pos': batch[3],
                'hts': batch[4],
                'index': batch[5].to(args.device),
            }
            outputs = model(**inputs)
            loss = outputs[0]
            if args.n_gpu > 1:
                loss = loss.mean()
            # apex amp scales the loss for mixed-precision training.
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            if args.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), args.max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            wandb.log({"loss": loss.item()}, step=num_steps)
            # Evaluate near the end of each epoch.
            if (step + 1) == len(train_dataloader) - 1:
                dev_score, dev_output = evaluate(args, model, dev_features,
                                                 tag="dev")
                wandb.log(dev_output, step=num_steps)
                print(dev_output)
                if dev_score > best_score:
                    best_score = dev_score
                    if test_features is not None:
                        pred = test(args, model, test_features)
                        with open("result.json", "w") as fh:
                            json.dump(pred, fh)
    return num_steps
def get_scheduler(optimizer, len_train_data):
    batch_size = config_dict["accumulated_batch_size"]
    epochs = config_dict["epochs"]
    num_train_steps = int(len_train_data / batch_size) * epochs
    num_warmup_steps = int(num_train_steps * config_dict["warmup_proportion"])
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps,
                                                num_train_steps)
    return scheduler
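# --- Hedged sketch of the module-level config_dict that get_scheduler reads;
# the keys come from the function body, the values are illustrative
# assumptions.
config_dict = {
    "accumulated_batch_size": 32,   # effective batch size after accumulation
    "epochs": 3,
    "warmup_proportion": 0.1,
}
# With len_train_data=32_000: 1000 steps/epoch * 3 epochs = 3000 steps,
# of which int(3000 * 0.1) = 300 are warmup.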
def configure_optimizers(self):
    if self.trial:
        # Let Optuna tune the learning rate within a typical BERT range.
        lr = self.trial.suggest_loguniform('learning_rate', 2e-5, 5e-5)
        self.logger.log_hyperparams({'lr': lr})
        optimizer = AdamW(self.parameters(), lr=lr, correct_bias=False)
    else:
        optimizer = AdamW(self.parameters(), lr=2e-5, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=self.total_steps)
    return [optimizer], [scheduler]
def __init__(self, dataset, batch_size=32):
    """Creates a new model for sentiment analysis using BERT."""
    # The pretrained weights to use.
    pretrained_weights = 'bert-base-uncased'
    # Create transformer to convert text to indexed tokens.
    transformer = BertTransform(62, pretrained_weights)
    # Setup the train loader.
    train_dataset = dataset('./', train=True, transforms=DataToTensor(),
                            vectorizer=transformer, download=True)
    self.train_loader = DataLoader(train_dataset, batch_size=batch_size,
                                   shuffle=False)
    # Setup the validation loader.
    val_dataset = dataset('./', train=False, transforms=DataToTensor(),
                          vectorizer=transformer, download=True)
    self.val_loader = DataLoader(val_dataset, batch_size=batch_size,
                                 shuffle=False)
    # Retrieve the CUDA device if available, otherwise use the CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device_name = (torch.cuda.get_device_name(0)
                   if torch.cuda.is_available() else "CPU")
    print("Training on:", device_name)
    # Load the pretrained BERT model with a classification layer.
    self.model = BertForSequenceClassification.from_pretrained(
        pretrained_weights, num_labels=2)
    self.model.to(self.device)
    # Set the learning rate.
    self.lr = 1e-5
    # Set the optimizer and scheduler. AdamW(correct_bias=False) and
    # get_linear_schedule_with_warmup come from transformers, not torch.optim,
    # which the original mistakenly referenced.
    training_steps = len(train_dataset) // batch_size
    self.optimizer = AdamW(self.model.parameters(), lr=self.lr,
                           correct_bias=False)
    # Warm up over the first 10% of training (the original passed the
    # fraction 0.1 where a step count is expected).
    self.scheduler = get_linear_schedule_with_warmup(
        self.optimizer,
        num_warmup_steps=int(0.1 * training_steps),
        num_training_steps=training_steps)
    # Maximum gradient norm (used for gradient clipping).
    self.max_grad_norm = 1.0
def get_linear_schedule_with_warmup_frac(optimizer, num_training_steps,
                                         num_warmup_steps,
                                         frac_training_steps=0,
                                         last_epoch=-1):
    # The original unconditionally overwrote num_warmup_steps, silently
    # ignoring the caller's value; only derive it from the fraction when one
    # is actually given.
    if frac_training_steps:
        num_warmup_steps = int(num_training_steps * frac_training_steps)
    return get_linear_schedule_with_warmup(
        optimizer,
        num_training_steps=num_training_steps,
        num_warmup_steps=num_warmup_steps,
        last_epoch=last_epoch)
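# --- Hedged usage of the frac helper above; the toy optimizer is an
# assumption. Warm up over the first 6% of 1000 steps, i.e. 60 steps.
import torch
toy_opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
sched = get_linear_schedule_with_warmup_frac(toy_opt,
                                             num_training_steps=1000,
                                             num_warmup_steps=0,
                                             frac_training_steps=0.06)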
def get_lr_scheduler(self, opt):
    scheduler = get_linear_schedule_with_warmup(
        opt,
        num_warmup_steps=self.hparams.warmup_steps,
        num_training_steps=self.train_steps)
    scheduler = {
        "scheduler": scheduler,
        "interval": "step",
        "frequency": 1,
    }
    return scheduler
def pre_train_bert(self):
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    optimizer = optim.Adam(self.bert_model.parameters(), 2e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer, 31, 310)
    step = 0
    train_dataloader = get_bert_lm_dataloader(self.lm_file_path, 64)
    print("Training LM")
    if torch.cuda.is_available():
        self.bert_model.cuda()
    for epoch in range(2):
        print("Epoch : " + str(epoch))
        for ind, batch in enumerate(train_dataloader):
            step += 1
            optimizer.zero_grad()
            if torch.cuda.is_available():
                inp = batch[0].cuda()
            else:
                inp = batch[0]
            labels = inp.clone()
            # Sample tokens for masked-LM training with probability 0.15,
            # as in BERT/RoBERTa.
            probability_matrix = torch.full(labels.shape, 0.15)
            special_tokens_mask = [
                tokenizer.get_special_tokens_mask(
                    val, already_has_special_tokens=True)
                for val in labels.tolist()
            ]
            probability_matrix.masked_fill_(
                torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
            if tokenizer._pad_token is not None:
                padding_mask = labels.eq(tokenizer.pad_token_id)
                padding_mask = padding_mask.detach().cpu()
                probability_matrix.masked_fill_(padding_mask, value=0.0)
            # Keep all masks on the input's device (the original mixed CPU
            # masks with CUDA tensors here).
            masked_indices = (torch.bernoulli(probability_matrix)
                              .bool().to(labels.device))
            labels[~masked_indices] = -100  # only compute loss on masked tokens
            # 80% of the time, replace masked input tokens with [MASK].
            indices_replaced = (
                torch.bernoulli(
                    torch.full(labels.shape, 0.8, device=labels.device)).bool()
                & masked_indices)
            inp[indices_replaced] = tokenizer.convert_tokens_to_ids(
                tokenizer.mask_token)
            # 10% of the time (half of the remaining 20%), replace masked
            # input tokens with a random word.
            indices_random = (
                torch.bernoulli(
                    torch.full(labels.shape, 0.5, device=labels.device)).bool()
                & masked_indices & ~indices_replaced)
            random_words = torch.randint(len(tokenizer), labels.shape,
                                         dtype=torch.long,
                                         device=labels.device)
            inp[indices_random] = random_words[indices_random]
            outputs = self.bert_model(
                inp,
                masked_lm_labels=labels.long(),
                attention_mask=(inp != tokenizer.pad_token_id).long())
            loss, prediction_scores = outputs[:2]
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(self.bert_model.parameters(), 1.0)
            print(str(step) + " Loss is :" + str(loss.item()))
            optimizer.step()
            scheduler.step()
    print("LM training done")
    torch.save(self.bert_model.state_dict(), "lm_joke_bert.pth")
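# --- Hedged numeric check of the masking split implied above (assumption:
# the standard BERT 80/10/10 recipe). Of the 15% of tokens selected, 80%
# become [MASK]; of the remaining 20%, the 0.5 bernoulli sends half to a
# random token and leaves half unchanged.
p_mask, p_to_mask_token, p_random_given_rest = 0.15, 0.8, 0.5
print(p_mask * p_to_mask_token)                                    # 0.12 -> [MASK]
print(p_mask * (1 - p_to_mask_token) * p_random_given_rest)        # 0.015 -> random
print(p_mask * (1 - p_to_mask_token) * (1 - p_random_given_rest))  # 0.015 unchanged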
def fit(self, dataset, validation=True, batch_size=1, patience=3, delta=0.):
    """
    Fits the model to the given dataset.

    Usage:
        >>> rge = Framework(**config)
        >>> rge.fit(train_data)
    """
    self.model.to(self.device)
    train_data = dataset.get_train(batch_size)
    if self.config['half']:
        self.model, self.optimizer = amp.initialize(
            self.model, self.optimizer, opt_level='O2',
            keep_batchnorm_fp32=True)
    if self.config['linear_scheduler']:
        num_training_steps = int(
            len(train_data) // self.grad_acc * self.config['epochs'])
        scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.config.get('warmup_steps', 0),
            num_training_steps=num_training_steps)
    else:
        scheduler = None
    early_stopping = EarlyStopping(patience, delta, self._save_checkpoint)
    for epoch in range(self.config['epochs']):
        self.optimizer.zero_grad()
        loss = self._train_step(train_data, epoch, scheduler=scheduler)
        if validation:
            val_loss, _, _, _ = self._val_step(dataset.get_val(batch_size),
                                               epoch)
            if early_stopping(val_loss, dataset=dataset, epoch=epoch,
                              loss=loss):
                break
    # Recover the best epoch.
    path = os.path.join("checkpoints", f"{dataset.name}.pth")
    config_path = os.path.join("checkpoints", f"{dataset.name}_config.json")
    _, _ = self._load_checkpoint(path, config_path)
def run_train():
    data_dir = config.DATA_DIR
    kgp = KGProcessor()
    rela_list = kgp.get_all_relations()
    examples = kgp.get_train_examples(data_dir)
    tokenizer = transformers.BertTokenizer.from_pretrained(
        config.BERT_TOKENIZER_PATH)
    features = kgp.convert_examples_to_features(examples, config.MAX_SEQ_LEN,
                                                tokenizer)
    input_ids = torch.tensor([f["input_ids"] for f in features],
                             dtype=torch.long)
    attention_mask = torch.tensor([f["attention_mask"] for f in features],
                                  dtype=torch.long)
    token_type_ids = torch.tensor([f["token_type_ids"] for f in features],
                                  dtype=torch.long)
    labels = torch.tensor([f["label"] for f in features])
    # One-hot encode the labels and cast to float for the loss.
    labels = F.one_hot(labels).float()
    dataset = TensorDataset(input_ids, attention_mask, token_type_ids, labels)
    sampler = SequentialSampler(dataset)
    data_loader = DataLoader(dataset,
                             sampler=sampler,
                             batch_size=config.TRAIN_BATCH_SIZE)
    num_training_steps = len(
        input_ids) // config.TRAIN_BATCH_SIZE * config.TRAIN_EPOCHS
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertKG(config.BERT_MODEL_PATH, len(rela_list))
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=config.LEARNING_RATE)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
    for epoch in range(config.TRAIN_EPOCHS):
        print(f"\n---------------------------epoch: {epoch+1}"
              f"---------------------------")
        train_fn(model, device, data_loader, optimizer, scheduler)
        model_to_save = model.module if hasattr(model, "module") else model
        output_path = os.path.join(f"{config.BERT_OUTPUT_PATH}/{epoch+1}",
                                   WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_path)
        tokenizer.save_vocabulary(
            f"{config.BERT_OUTPUT_PATH}/{epoch+1}/vocab.txt")
    model_to_save = model.module if hasattr(model, "module") else model
    output_path = os.path.join(f"{config.BERT_OUTPUT_PATH}", WEIGHTS_NAME)
    torch.save(model_to_save.state_dict(), output_path)
    tokenizer.save_vocabulary(f"{config.BERT_OUTPUT_PATH}/vocab.txt")
def configure_optimizers(self):
    if self.args.adafactor:
        optimizer = Adafactor(self.model.parameters(), lr=self.args.lr,
                              scale_parameter=False, relative_step=False)
    else:
        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.args.lr)
    if self.args.debug:
        return optimizer  # constant LR
    num_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 1
    # Optimizer steps per run: examples / (gpus * grad_accum * batch_size).
    num_steps = int(self.args.dataset_size * self.args.epochs
                    / num_gpus / self.args.grad_accum / self.args.batch_size)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.args.warmup,
        num_training_steps=num_steps)
    return [optimizer], [{"scheduler": scheduler, "interval": "step"}]
def _get_scheduler(self):
    """Get scheduler for different models.

    Returns:
        scheduler
    """
    if self.config.model_type == 'bert':
        scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.config.num_warmup_steps,
            num_training_steps=self.config.num_training_steps)
    else:  # rnn
        scheduler = get_constant_schedule(self.optimizer)
    return scheduler
def train_dataloader(self):
    train_batch_size = self.hparams.train_batch_size
    dataloader = self.load_dataset("train", train_batch_size)
    t_total = (
        (len(dataloader.dataset)
         // (train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt,
        num_warmup_steps=self.hparams.warmup_steps,
        num_training_steps=t_total)
    self.lr_scheduler = scheduler
    return dataloader
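# --- Hedged restatement of the t_total arithmetic above, factored into a
# helper for clarity; the name and example values are assumptions.
def total_optimizer_steps(num_examples, batch_size, n_gpu, grad_accum, epochs):
    steps_per_epoch = (num_examples // (batch_size * max(1, n_gpu))) // grad_accum
    return int(steps_per_epoch * epochs)

# e.g. 10_000 examples, batch 16, 2 GPUs, accumulation 2, 3 epochs -> 468 steps
assert total_optimizer_steps(10_000, 16, 2, 2, 3) == 468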
def train(output_dim, num_layers, embedding_dim, hidden_size, model_path,
          model_type, dropout, criterion, lr, epochs, sequence_length,
          num_channels, kernels, nhead, warmup_epochs, batch_size, device):
    model_types = ('Transformer', 'CNN', 'LSTM')
    assert model_type in model_types, \
        f'model_type must be one of {", ".join(model_types)}'
    song_loader = get_song_loader(batch_size, sequence_length)
    if model_type == 'Transformer':
        model = Classical_Music_Transformer(embedding_dim, hidden_size,
                                            output_dim, num_layers, dropout,
                                            device, nhead,
                                            sequence_length).to(device)
    elif model_type == 'CNN':
        model = Classical_Music_CNN(embedding_dim, output_dim, num_channels,
                                    kernels, dropout, device,
                                    sequence_length).to(device)
    elif model_type == 'LSTM':
        model = Classical_Music_LSTM(embedding_dim, hidden_size, output_dim,
                                     num_layers, dropout, device,
                                     sequence_length).to(device)
    model.train()
    optimizer = torch.optim.AdamW(model.parameters(), lr)
    num_training_steps = len(song_loader) * epochs
    num_warmup_steps = len(song_loader) * warmup_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps)
    for epoch in tqdm(range(1, epochs + 1), total=epochs):
        batch_losses = []
        start = time.time()
        for inputs, targets in song_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            if model_type == 'Transformer':
                output, mu, logvar = model(inputs, targets)
                loss = cross_entropy_and_KL(output, targets, mu, logvar)
            else:
                outputs = model(inputs)
                loss = criterion(outputs, targets)
            batch_losses.append(loss.item())
            loss.backward()
            optimizer.step()
            scheduler.step()
        print(f'Epoch {epoch}/{epochs},\tLoss {np.mean(batch_losses)}'
              f',\tDuration {time.time()-start}')
    # Save once, moved to CPU so the checkpoint loads on GPU-less machines
    # (the original saved twice to the same path).
    torch.save(model.to('cpu'), model_path)