def get_constant_schedule_with_warmup(optimizer, epochs, batch_size, n_samples):
    warmup_proportion = 0.3
    n_steps = int(np.ceil(n_samples / batch_size))
    num_training_steps = n_steps * epochs
    num_warmup_steps = int(warmup_proportion * num_training_steps)
    sch = optimization.get_constant_schedule_with_warmup(optimizer, num_warmup_steps)
    return sch
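For reference, the library scheduler this wrapper builds is essentially a LambdaLR whose multiplier ramps linearly from 0 to 1 over num_warmup_steps optimizer updates and then stays at 1 forever. A minimal re-implementation sketch (not the library code verbatim):

from torch.optim.lr_scheduler import LambdaLR

def constant_schedule_with_warmup_sketch(optimizer, num_warmup_steps, last_epoch=-1):
    # Multiplier applied to each param group's base lr on every scheduler.step().
    def lr_lambda(current_step):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1.0, num_warmup_steps))
        return 1.0
    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)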
def init_fn(optimizer, epochs, batch_size, n_samples):
    # warmup_proportion is assumed to be defined in the enclosing scope (e.g. 0.3).
    n_steps = int(np.ceil(n_samples / batch_size))
    num_training_steps = n_steps * epochs
    num_warmup_steps = int(warmup_proportion * num_training_steps)
    sch = optimization.get_constant_schedule_with_warmup(optimizer, num_warmup_steps)
    update_in_batch, update_in_epoch = True, False
    return sch, update_in_batch, update_in_epoch
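The extra update_in_batch / update_in_epoch flags appear to tell the caller whether to step the scheduler once per batch or once per epoch. A hypothetical consumer (this loop is illustrative, not taken from the original repository; optimizer, train_loader and train_step are assumed to exist):

sch, update_in_batch, update_in_epoch = init_fn(optimizer, epochs, batch_size, n_samples)
for epoch in range(epochs):
    for batch in train_loader:
        train_step(batch)      # assumed helper: forward, backward, optimizer.step()
        if update_in_batch:
            sch.step()         # warmup schedules are usually advanced per optimizer update
    if update_in_epoch:
        sch.step()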
def __init__(self, optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1) -> None:
    lr_scheduler = get_constant_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        last_epoch=last_epoch)
    super().__init__(lr_scheduler)
def configure_optimizers(self):
    optimizer = AdamW(params=self.parameters(), lr=self.hparams['learning_rate'])
    warmup_steps = self.hparams['warmup_steps']
    scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
    return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]
def _create_lr_scheduler(self) -> Dict:
    """Returns one of three default schedulers

    Possibilities: constant/linear/cosine schedule with or without warmup
    """
    steps_per_epoch = math.ceil(
        len(self._train_instances) / self._trainer_config.batch_size
    )
    try:
        training_steps = min(
            self._trainer_config.max_steps,
            self._trainer_config.max_epochs * steps_per_epoch,
        )
    # One or both of the max_* is None:
    except TypeError:
        training_steps = (
            self._trainer_config.max_steps
            # 1000 is the default of the lightning trainer
            or (self._trainer_config.max_epochs or 1000) * steps_per_epoch
        )

    if self._trainer_config.lr_decay == "linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer=self._pipeline.model.optimizer,
            num_warmup_steps=self._trainer_config.warmup_steps,
            num_training_steps=training_steps,
        )
    elif self._trainer_config.lr_decay == "cosine":
        scheduler = get_cosine_schedule_with_warmup(
            optimizer=self._pipeline.model.optimizer,
            num_warmup_steps=self._trainer_config.warmup_steps,
            num_training_steps=training_steps,
        )
    else:
        scheduler = get_constant_schedule_with_warmup(
            optimizer=self._pipeline.model.optimizer,
            num_warmup_steps=self._trainer_config.warmup_steps,
        )

    return {
        "scheduler": scheduler,
        "interval": "step",
        "name": "learning_rate",
    }
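To see how the three branches differ in practice, the sketch below builds each schedule on a throwaway optimizer and prints the learning-rate multipliers for the first twenty steps (assumes torch and transformers are installed; the warmup and step counts are purely illustrative):

import torch
from transformers import (get_constant_schedule_with_warmup,
                          get_cosine_schedule_with_warmup,
                          get_linear_schedule_with_warmup)

param = torch.nn.Parameter(torch.zeros(1))
for name, factory in [
    ("constant", lambda opt: get_constant_schedule_with_warmup(opt, num_warmup_steps=5)),
    ("linear", lambda opt: get_linear_schedule_with_warmup(opt, num_warmup_steps=5, num_training_steps=20)),
    ("cosine", lambda opt: get_cosine_schedule_with_warmup(opt, num_warmup_steps=5, num_training_steps=20)),
]:
    optimizer = torch.optim.AdamW([param], lr=1.0)  # base lr of 1.0, so the printed lr equals the multiplier
    scheduler = factory(optimizer)
    lrs = []
    for _ in range(20):
        optimizer.step()
        scheduler.step()
        lrs.append(round(scheduler.get_last_lr()[0], 3))
    print(name, lrs)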
def train(args):
    # torch.multiprocessing.set_sharing_strategy('file_system')
    # too many barriers / one node data parallel and multiple node DDP
    os.environ['MASTER_ADDR'] = args["master_addr"]
    os.environ['MASTER_PORT'] = args["master_port"]
    os.environ['TOKENIZERS_PARALLELISM'] = "true"
    torch.backends.cudnn.benchmark = True
    rank = args["nr"]
    gpus = args["gpus_per_node"]
    if args["cpu"]:
        assert args["world_size"] == 1
        device = torch.device("cpu")
        barrier = get_barrier(False)
    else:
        dist.init_process_group(args["dist_backend"], rank=rank, world_size=args["world_size"])
        device = torch.device('cuda:0')  # Unique only on individual node.
        torch.cuda.set_device(device)
        barrier = get_barrier(True)
    set_seeds(args["seed"])
    mconf = model_config.to_dict()
    config = dict(md_config=md_config, sm_config=sm_config)[mconf.pop("model_size")]
    tokenizer = get_tokenizer(mconf.pop("tokenizer_name"))
    config.vocab_size = len(tokenizer) + 22
    config.tokenizer_length = 1024
    config.tokenizer_length = config.tokenizer_length - config.num_highway_cls_tokens
    config.max_position_embeddings = config.max_position_embeddings + config.num_highway_cls_tokens
    collate_fn = get_collate_fn(config.num_highway_cls_tokens, tokenizer.pad_token_id)
    model = FastFormerForFusedELECTRAPretraining(config, tokenizer=tokenizer, **mconf).to(device)
    print("Trainable Params = %s" % (numel(model) / 1_000_000))
    if args["pretrained_model"] is not None:
        model.load_state_dict(torch.load(args["pretrained_model"], map_location={'cuda:%d' % 0: 'cuda:%d' % 0}))
    model.data_parallel = True
    # Take model to local rank
    if args["cpu"]:
        ddp_model = model
    else:
        if torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
        ddp_model = DDP(model, device_ids=[0], find_unused_parameters=True)
    all_params = list(filter(lambda p: p.requires_grad, ddp_model.parameters()))
    optc = optimizer_config.to_dict()
    optimizer = AdamW(all_params, lr=optc["lr"], eps=optc["eps"],
                      weight_decay=optc["weight_decay"], betas=(optc["beta_1"], optc["beta_2"]))
    optimizer.zero_grad()
    scaler = GradScaler()
    model_save_dir = args["model_save_dir"]
    model_save_name = args["model_save_name"]
    if rank == 0:
        if not os.path.exists(model_save_dir):
            os.makedirs(model_save_dir)
        assert os.path.exists(model_save_dir)
    barrier()
    print("Optimizer Created for Rank = %s" % rank)
    shuffle_dataset = args["shuffle_dataset"]
    sampling_fraction = optc["sampling_fraction"]
    if not args["validate_only"] and not args["test_only"]:
        train_loader = build_dataloader(args["train_dataset"], shuffle_dataset, sampling_fraction,
                                        config, collate_fn, tokenizer,
                                        world_size=args["world_size"], num_workers=args["num_workers"])
    print("Data Loaded for Rank = %s" % rank)
    validate_every_steps = args["validate_every_steps"]
    log_every_steps = args["log_every_steps"]
    save_every_steps = args["save_every_steps"]
    scheduler = optimization.get_constant_schedule_with_warmup(optimizer, optc["warmup_steps"])
    gradient_clipping = optc["gradient_clipping"]
    _ = model.train()
    barrier()
    start_time = time.time()
    batch_times = []
    model_times = []
    full_times = []
    print("Start Training for Rank = %s" % rank)
    for step, batch in enumerate(train_loader):
        model.zero_grad()
        optimizer.zero_grad()
        if step == 0:
            print("First Batch Training for Rank = %s" % rank)
        # if step <= 39:
        #     continue
        gen_batch_time = time.time() - start_time
        batch_times.append(gen_batch_time)
        if (step + 1) % save_every_steps == 0:
            if rank == 0:
                torch.save(ddp_model.state_dict(), os.path.join(model_save_dir, model_save_name))
            barrier()
        if (step + 1) % validate_every_steps == 0:
            if rank == 0:
                val_results = LargeValidator(args["validation_dataset"], ddp_model, config, device, tokenizer)()
                print("Rank = %s, steps = %s, Val = %s" % (rank, step, val_results))
            barrier()
        record_accuracy = False
        if (step + 1) % log_every_steps == 0:
            record_accuracy = True
        batch["record_accuracy"] = record_accuracy
        labels = batch["label_mlm_input_ids"] if "label_mlm_input_ids" in batch else batch["input_ids"]
        labels = labels.to(device)
        model_start_time = time.time()
        if args["cpu"]:
            output = ddp_model(**batch, labels=labels)
            output = {key: [item[key] for item in output]
                      for key in list(functools.reduce(
                          lambda x, y: x.union(y),
                          (set(dicts.keys()) for dicts in output)))}
            output = {k: torch.mean(v) for k, v in output.items()}
            loss = output["loss"]
            loss_dict = output["loss_dict"]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(all_params, gradient_clipping)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        else:
            with autocast():
                output = ddp_model(**batch, labels=labels)
                output = {key: [item[key] for item in output]
                          for key in list(functools.reduce(
                              lambda x, y: x.union(y),
                              (set(dicts.keys()) for dicts in output)))}
                output = {k: torch.mean(v) for k, v in output.items()}
                loss = output["loss"]
                loss_dict = output["loss_dict"]
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(all_params, gradient_clipping)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()
        model_end_time = time.time() - model_start_time
        model_times.append(model_end_time)
        full_time = time.time() - start_time
        full_times.append(full_time)
        start_time = time.time()
        if (step + 1) % log_every_steps == 0:
            print("Rank = %s, steps = %s, batch_size = %s, Loss = %s, Accuracy = %s" % (
                rank, step, batch["input_ids"].size(), loss_dict, output["accuracy_hist"]))
            print("Batch time = %s, Model Time = %s, Full time = %s" % (
                np.mean(batch_times), np.mean(model_times), np.mean(full_times)))
            batch_times = []
            model_times = []
            full_times = []
        clean_memory()
        barrier()
    # Take inputs to local_rank
    # TODO: validate on multigpu, sort the val datasets alphabetically and let the gpu with rank == dataset rank in sort pick up the dataset. GPUs with rank > len(datasetDict) stay idle.
    # TODO: select one dataset and make full batch from it, this way rebalancing can be easy.
    # TODO: dataset rebalancing.
    # TODO: save model only in local_rank == 0 process
    # TODO: Check if all initialised model weights are same??
    # I've been tracking an ema of sample training loss during training and using that to guide weighted data sampling (rather than the typical uniform sampling). Seems to help with a variety of real world datasets where the bulk of the data is often very similar and easy to learn but certain subpopulations are much more challenging.
    pass
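The mixed-precision branch above follows the usual GradScaler ordering: scale the loss before backward(), unscale before gradient clipping, let the scaler drive optimizer.step(), and only then advance the warmup schedule. Stripped of the model-specific code, the pattern looks roughly like this (a sketch; model, loader, optimizer and scheduler are assumed to exist):

import torch
from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()
for batch in loader:
    optimizer.zero_grad()
    with autocast():
        loss = model(**batch)["loss"]
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)                         # so clipping sees true gradient magnitudes
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(optimizer)                             # skips the update if gradients overflowed
    scaler.update()
    scheduler.step()                                   # advance warmup only after an optimizer step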
def main(args): local_config = json.load(open(args.local_config_path)) local_config['loss'] = args.loss local_config['data_dir'] = args.data_dir local_config['train_batch_size'] = args.train_batch_size local_config[ 'gradient_accumulation_steps'] = args.gradient_accumulation_steps local_config['lr_scheduler'] = args.lr_scheduler local_config['model_name'] = args.model_name local_config['pool_type'] = args.pool_type local_config['seed'] = args.seed local_config['do_train'] = args.do_train local_config['do_validation'] = args.do_validation local_config['do_eval'] = args.do_eval local_config['use_cuda'] = args.use_cuda.lower() == 'true' local_config['num_train_epochs'] = args.num_train_epochs local_config['eval_batch_size'] = args.eval_batch_size local_config['max_seq_len'] = args.max_seq_len local_config['syns'] = ["Target", "Synonym"] local_config['target_embeddings'] = args.target_embeddings local_config['symmetric'] = args.symmetric.lower() == 'true' local_config['mask_syns'] = args.mask_syns local_config['train_scd'] = args.train_scd local_config['ckpt_path'] = args.ckpt_path local_config['head_batchnorm'] = args.head_batchnorm local_config['head_hidden_size'] = args.head_hidden_size local_config['linear_head'] = args.linear_head.lower() == 'true' local_config['emb_size_for_cosine'] = args.emb_size_for_cosine local_config['add_fc_layer'] = args.add_fc_layer if local_config['do_train'] and os.path.exists(args.output_dir): from glob import glob model_weights = glob(os.path.join(args.output_dir, '*.bin')) if model_weights: print(f'{model_weights}: already computed: skipping ...') return else: print( f'already existing {args.output_dir}. but without model weights ...' ) return device = torch.device("cuda" if local_config['use_cuda'] else "cpu") n_gpu = torch.cuda.device_count() if local_config['gradient_accumulation_steps'] < 1: raise ValueError( "gradient_accumulation_steps parameter should be >= 1") local_config['train_batch_size'] = \ local_config['train_batch_size'] // local_config['gradient_accumulation_steps'] if local_config['do_train']: random.seed(local_config['seed']) np.random.seed(local_config['seed']) torch.manual_seed(local_config['seed']) if n_gpu > 0: torch.cuda.manual_seed_all(local_config['seed']) if not local_config['do_train'] and not local_config['do_eval']: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if local_config['do_train'] and not os.path.exists(args.output_dir): os.makedirs(args.output_dir) os.makedirs(os.path.join(args.output_dir, 'nen-nen-weights')) elif local_config['do_train'] or local_config['do_validation']: raise ValueError(args.output_dir, 'output_dir already exists') suffix = datetime.now().isoformat().replace('-', '_').replace( ':', '_').split('.')[0].replace('T', '-') if local_config['do_train']: train_writer = SummaryWriter(log_dir=os.path.join( args.output_dir, f'tensorboard-{suffix}', 'train')) dev_writer = SummaryWriter(log_dir=os.path.join( args.output_dir, f'tensorboard-{suffix}', 'dev')) logger.addHandler( logging.FileHandler( os.path.join(args.output_dir, f"train_{suffix}.log"), 'w')) eval_logger.addHandler( logging.FileHandler( os.path.join(args.output_dir, f"scores_{suffix}.log"), 'w')) else: logger.addHandler( logging.FileHandler( os.path.join(args.ckpt_path, f"eval_{suffix}.log"), 'w')) logger.info(args) logger.info(json.dumps(vars(args), indent=4)) if args.do_train: json.dump( local_config, open(os.path.join(args.output_dir, 'local_config.json'), 'w')) json.dump(vars(args), open(os.path.join(args.output_dir, 
'args.json'), 'w')) logger.info("device: {}, n_gpu: {}".format(device, n_gpu)) with open(os.path.join(args.output_dir, 'local_config.json'), 'w') as outp: json.dump(local_config, outp, indent=4) with open(os.path.join(args.output_dir, 'args.json'), 'w') as outp: json.dump(vars(args), outp, indent=4) syns = sorted(local_config['syns']) id2classifier = {i: classifier for i, classifier in enumerate(syns)} model_name = local_config['model_name'] data_processor = DataProcessor() train_dir = os.path.join(local_config['data_dir'], 'train/') dev_dir = os.path.join(local_config['data_dir'], 'dev') if local_config['do_train']: config = configs[local_config['model_name']] config = config.from_pretrained(local_config['model_name'], hidden_dropout_prob=args.dropout) if args.ckpt_path != '': model_path = args.ckpt_path else: model_path = local_config['model_name'] model = models[model_name].from_pretrained( model_path, cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE), local_config=local_config, data_processor=data_processor, config=config) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ param for name, param in param_optimizer if not any(nd in name for nd in no_decay) ], 'weight_decay': float(args.weight_decay) }, { 'params': [ param for name, param in param_optimizer if any(nd in name for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=float(args.learning_rate), eps=1e-6, betas=(0.9, 0.98), correct_bias=True) train_features = model.convert_dataset_to_features(train_dir, logger) if args.train_mode == 'sorted' or args.train_mode == 'random_sorted': train_features = sorted(train_features, key=lambda f: np.sum(f.input_mask)) else: random.shuffle(train_features) # import pdb; pdb.set_trace() train_dataloader = \ get_dataloader_and_tensors(train_features, local_config['train_batch_size']) train_batches = [batch for batch in train_dataloader] num_train_optimization_steps = \ len(train_batches) // local_config['gradient_accumulation_steps'] * \ local_config['num_train_epochs'] warmup_steps = int(args.warmup_proportion * num_train_optimization_steps) if local_config['lr_scheduler'] == 'linear_warmup': scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps) elif local_config['lr_scheduler'] == 'constant_warmup': scheduler = get_constant_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps) logger.info("***** Training *****") logger.info(" Num examples = %d", len(train_features)) logger.info(" Batch size = %d", local_config['train_batch_size']) logger.info(" Num steps = %d", num_train_optimization_steps) if local_config['do_validation']: dev_features = model.convert_dataset_to_features(dev_dir, logger) logger.info("***** Dev *****") logger.info(" Num examples = %d", len(dev_features)) logger.info(" Batch size = %d", local_config['eval_batch_size']) dev_dataloader = \ get_dataloader_and_tensors(dev_features, local_config['eval_batch_size']) test_dir = os.path.join(local_config['data_dir'], 'test/') if os.path.exists(test_dir): test_features = model.convert_dataset_to_features( test_dir, test_logger) logger.info("***** Test *****") logger.info(" Num examples = %d", len(test_features)) logger.info(" Batch size = %d", local_config['eval_batch_size']) test_dataloader = \ get_dataloader_and_tensors(test_features, local_config['eval_batch_size']) best_result = defaultdict(float) eval_step = max(1, 
len(train_batches) // args.eval_per_epoch) start_time = time.time() global_step = 0 model.to(device) lr = float(args.learning_rate) for epoch in range(1, 1 + local_config['num_train_epochs']): tr_loss = 0 nb_tr_examples = 0 nb_tr_steps = 0 cur_train_loss = defaultdict(float) model.train() logger.info("Start epoch #{} (lr = {})...".format( epoch, scheduler.get_lr()[0])) if args.train_mode == 'random' or args.train_mode == 'random_sorted': random.shuffle(train_batches) train_bar = tqdm(train_batches, total=len(train_batches), desc='training ... ') for step, batch in enumerate(train_bar): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, token_type_ids, \ syn_labels, positions = batch train_loss, _ = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, input_labels={ 'syn_labels': syn_labels, 'positions': positions }) loss = train_loss['total'].mean().item() for key in train_loss: cur_train_loss[key] += train_loss[key].mean().item() train_bar.set_description( f'training... [epoch == {epoch} / {local_config["num_train_epochs"]}, loss == {loss}]' ) loss_to_optimize = train_loss['total'] if local_config['gradient_accumulation_steps'] > 1: loss_to_optimize = \ loss_to_optimize / local_config['gradient_accumulation_steps'] loss_to_optimize.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss_to_optimize.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % local_config['gradient_accumulation_steps'] == 0: optimizer.step() scheduler.step() optimizer.zero_grad() global_step += 1 if local_config['do_validation'] and (step + 1) % eval_step == 0: logger.info( 'Ep: {}, Stp: {}/{}, usd_t={:.2f}s, loss={:.6f}'. format(epoch, step + 1, len(train_batches), time.time() - start_time, tr_loss / nb_tr_steps)) cur_train_mean_loss = {} for key, value in cur_train_loss.items(): cur_train_mean_loss[f'train_{key}_loss'] = \ value / nb_tr_steps dev_predictions = os.path.join(args.output_dir, 'dev_predictions') metrics = predict(model, dev_dataloader, dev_predictions, dev_features, args, cur_train_mean_loss=cur_train_mean_loss, logger=eval_logger) metrics['global_step'] = global_step metrics['epoch'] = epoch metrics['learning_rate'] = scheduler.get_lr()[0] metrics['batch_size'] = \ local_config['train_batch_size'] * local_config['gradient_accumulation_steps'] for key, value in metrics.items(): dev_writer.add_scalar(key, value, global_step) scores_to_logger = tuple([ round(metrics[save_by_score] * 100.0, 2) for save_by_score in args.save_by_score.split('+') ]) logger.info( f"dev %s (lr=%s, epoch=%d): %s" % (args.save_by_score, str( scheduler.get_lr()[0]), epoch, scores_to_logger)) predict_parts = [ part for part in metrics if part.endswith('.score') and metrics[part] > args.start_save_threshold and metrics[part] > best_result[part] ] if len(predict_parts) > 0: best_dev_predictions = os.path.join( args.output_dir, 'best_dev_predictions') dev_predictions = os.path.join(args.output_dir, 'dev_predictions') os.makedirs(best_dev_predictions, exist_ok=True) for part in predict_parts: logger.info( "!!! 
Best dev %s (lr=%s, epoch=%d): %.2f -> %.2f" % (part, str(scheduler.get_lr()[0]), epoch, best_result[part] * 100.0, metrics[part] * 100.0)) best_result[part] = metrics[part] if [ save_weight for save_weight in args.save_by_score.split('+') if save_weight == part ]: os.makedirs(os.path.join( args.output_dir, part), exist_ok=True) output_model_file = os.path.join( args.output_dir, part, WEIGHTS_NAME) save_model(args, model, output_model_file, metrics) if 'nen-nen' not in part: os.system( f'cp {dev_predictions}/{".".join(part.split(".")[1:-1])}* {best_dev_predictions}/' ) else: output_model_file = os.path.join( args.output_dir, 'nen-nen-weights', WEIGHTS_NAME) save_model(args, model, output_model_file, metrics) # dev_predictions = os.path.join(args.output_dir, 'dev_predictions') # predict( # model, dev_dataloader, dev_predictions, # dev_features, args, only_parts='+'.join(predict_parts) # ) # best_dev_predictions = os.path.join(args.output_dir, 'best_dev_predictions') # os.makedirs(best_dev_predictions, exist_ok=True) # os.system(f'mv {dev_predictions}/* {best_dev_predictions}/') if 'scd' not in '+'.join( predict_parts) and os.path.exists(test_dir): test_predictions = os.path.join( args.output_dir, 'test_predictions') test_metrics = predict( model, test_dataloader, test_predictions, test_features, args, only_parts='+'.join([ 'test' + part[3:] for part in predict_parts if 'nen-nen' not in part ])) best_test_predictions = os.path.join( args.output_dir, 'best_test_predictions') os.makedirs(best_test_predictions, exist_ok=True) os.system( f'mv {test_predictions}/* {best_test_predictions}/' ) for key, value in test_metrics.items(): if key.endswith('score'): dev_writer.add_scalar( key, value, global_step) if args.log_train_metrics: metrics = predict(model, train_dataloader, os.path.join(args.output_dir, 'train_predictions'), train_features, args, logger=logger) metrics['global_step'] = global_step metrics['epoch'] = epoch metrics['learning_rate'] = scheduler.get_lr()[0] metrics['batch_size'] = \ local_config['train_batch_size'] * local_config['gradient_accumulation_steps'] for key, value in metrics.items(): train_writer.add_scalar(key, value, global_step) if local_config['do_eval']: assert args.ckpt_path != '', 'in do_eval mode ckpt_path should be specified' test_dir = args.eval_input_dir config = configs[model_name].from_pretrained(model_name) model = models[model_name].from_pretrained( args.ckpt_path, local_config=local_config, data_processor=data_processor, config=config) model.to(device) test_features = model.convert_dataset_to_features( test_dir, test_logger) logger.info("***** Test *****") logger.info(" Num examples = %d", len(test_features)) logger.info(" Batch size = %d", local_config['eval_batch_size']) test_dataloader = \ get_dataloader_and_tensors(test_features, local_config['eval_batch_size']) metrics = predict(model, test_dataloader, os.path.join(args.output_dir, args.eval_output_dir), test_features, args, compute_metrics=True) print(metrics) with open( os.path.join(args.output_dir, args.eval_output_dir, 'metrics.txt'), 'w') as outp: print(metrics, file=outp)
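As in the block above, the warmup length is derived from a proportion of the total number of optimizer updates, and with gradient accumulation the scheduler is stepped once per accumulated update rather than once per batch. A condensed sketch of that bookkeeping (the names mirror the block above and compute_loss is an assumed helper):

from transformers import get_constant_schedule_with_warmup

accumulation_steps = 4
num_updates = len(train_batches) // accumulation_steps * num_train_epochs
warmup_steps = int(warmup_proportion * num_updates)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)

for step, batch in enumerate(train_batches):
    loss = compute_loss(batch) / accumulation_steps    # assumed helper returning a scalar loss tensor
    loss.backward()
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        scheduler.step()                               # one scheduler step per optimizer update
        optimizer.zero_grad()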
def train( self, train_dataset, output_dir, show_running_loss=True, eval_data=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ model = self.model args = self.args device = self.device tb_writer = SummaryWriter(logdir=args.tensorboard_dir) train_sampler = RandomSampler(train_dataset) train_dataloader = DataLoader( train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, num_workers=self.args.dataloader_num_workers, ) if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = ( args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 ) else: t_total = ( len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs ) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [] custom_parameter_names = set() for group in self.args.custom_parameter_groups: params = group.pop("params") custom_parameter_names.update(params) param_group = {**group} param_group["params"] = [ p for n, p in model.named_parameters() if n in params ] optimizer_grouped_parameters.append(param_group) for group in self.args.custom_layer_parameters: layer_number = group.pop("layer") layer = f"layer.{layer_number}." group_d = {**group} group_nd = {**group} group_nd["weight_decay"] = 0.0 params_d = [] params_nd = [] for n, p in model.named_parameters(): if n not in custom_parameter_names and layer in n: if any(nd in n for nd in no_decay): params_nd.append(p) else: params_d.append(p) custom_parameter_names.add(n) group_d["params"] = params_d group_nd["params"] = params_nd optimizer_grouped_parameters.append(group_d) optimizer_grouped_parameters.append(group_nd) if not self.args.train_custom_parameters_only: optimizer_grouped_parameters.extend( [ { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] ) warmup_steps = math.ceil(t_total * args.warmup_ratio) args.warmup_steps = ( warmup_steps if args.warmup_steps == 0 else args.warmup_steps ) if args.optimizer == "AdamW": optimizer = AdamW( optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon, ) elif args.optimizer == "Adafactor": optimizer = Adafactor( optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adafactor_eps, clip_threshold=args.adafactor_clip_threshold, decay_rate=args.adafactor_decay_rate, beta1=args.adafactor_beta1, weight_decay=args.weight_decay, scale_parameter=args.adafactor_scale_parameter, relative_step=args.adafactor_relative_step, warmup_init=args.adafactor_warmup_init, ) print("Using Adafactor for T5") else: raise ValueError( "{} is not a valid optimizer class. 
Please use one of ('AdamW', 'Adafactor') instead.".format( args.optimizer ) ) if args.scheduler == "constant_schedule": scheduler = get_constant_schedule(optimizer) elif args.scheduler == "constant_schedule_with_warmup": scheduler = get_constant_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps ) elif args.scheduler == "linear_schedule_with_warmup": scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, ) elif args.scheduler == "cosine_schedule_with_warmup": scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, num_cycles=args.cosine_schedule_num_cycles, ) elif args.scheduler == "cosine_with_hard_restarts_schedule_with_warmup": scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, num_cycles=args.cosine_schedule_num_cycles, ) elif args.scheduler == "polynomial_decay_schedule_with_warmup": scheduler = get_polynomial_decay_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, lr_end=args.polynomial_decay_schedule_lr_end, power=args.polynomial_decay_schedule_power, ) else: raise ValueError("{} is not a valid scheduler.".format(args.scheduler)) if ( args.model_name and os.path.isfile(os.path.join(args.model_name, "optimizer.pt")) and os.path.isfile(os.path.join(args.model_name, "scheduler.pt")) ): # Load in optimizer and scheduler states optimizer.load_state_dict( torch.load(os.path.join(args.model_name, "optimizer.pt")) ) scheduler.load_state_dict( torch.load(os.path.join(args.model_name, "scheduler.pt")) ) if args.n_gpu > 1: model = torch.nn.DataParallel(model) logger.info(" Training started") global_step = 0 training_progress_scores = None tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange( int(args.num_train_epochs), desc="Epoch", disable=args.silent, mininterval=0 ) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 steps_trained_in_current_epoch = 0 epochs_trained = 0 if args.model_name and os.path.exists(args.model_name): try: # set global_step to gobal_step of last saved checkpoint from model path checkpoint_suffix = args.model_name.split("/")[-1].split("-") if len(checkpoint_suffix) > 2: checkpoint_suffix = checkpoint_suffix[1] else: checkpoint_suffix = checkpoint_suffix[-1] global_step = int(checkpoint_suffix) epochs_trained = global_step // ( len(train_dataloader) // args.gradient_accumulation_steps ) steps_trained_in_current_epoch = global_step % ( len(train_dataloader) // args.gradient_accumulation_steps ) logger.info( " Continuing training from checkpoint, will skip to saved global_step" ) logger.info(" Continuing training from epoch %d", epochs_trained) logger.info(" Continuing training from global step %d", global_step) logger.info( " Will skip the first %d steps in the current epoch", steps_trained_in_current_epoch, ) except ValueError: logger.info(" Starting fine-tuning.") if args.evaluate_during_training: training_progress_scores = self._create_training_progress_scores(**kwargs) if args.wandb_project: wandb.init( project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs, ) wandb.run._label(repo="simpletransformers") wandb.watch(self.model) if args.fp16: from torch.cuda import amp scaler = amp.GradScaler() for current_epoch in train_iterator: model.train() if epochs_trained > 0: epochs_trained -= 1 continue train_iterator.set_description( f"Epoch 
{epoch_number + 1} of {args.num_train_epochs}" ) batch_iterator = tqdm( train_dataloader, desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}", disable=args.silent, mininterval=0, ) for step, batch in enumerate(batch_iterator): if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue inputs = self._get_inputs_dict(batch) if args.fp16: with amp.autocast(): outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] else: outputs = model(**inputs) # model outputs are always tuple in pytorch-transformers (see doc) loss = outputs[0] if args.n_gpu > 1: loss = ( loss.mean() ) # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: batch_iterator.set_description( f"Epochs {epoch_number}/{args.num_train_epochs}. Running Loss: {current_loss:9.4f}" ) if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: scaler.scale(loss).backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: scaler.unscale_(optimizer) if args.optimizer == "AdamW": torch.nn.utils.clip_grad_norm_( model.parameters(), args.max_grad_norm ) if args.fp16: scaler.step(optimizer) scaler.update() else: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics tb_writer.add_scalar( "lr", scheduler.get_last_lr()[0], global_step ) tb_writer.add_scalar( "loss", (tr_loss - logging_loss) / args.logging_steps, global_step, ) logging_loss = tr_loss if args.wandb_project or self.is_sweeping: wandb.log( { "Training loss": current_loss, "lr": scheduler.get_last_lr()[0], "global_step": global_step, } ) if args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step) ) self.save_model( output_dir_current, optimizer, scheduler, model=model ) if args.evaluate_during_training and ( args.evaluate_during_training_steps > 0 and global_step % args.evaluate_during_training_steps == 0 ): # Only evaluate when single GPU otherwise metrics may not average well results = self.eval_model( eval_data, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) for key, value in results.items(): try: tb_writer.add_scalar( "eval_{}".format(key), value, global_step ) except (NotImplementedError, AssertionError): pass output_dir_current = os.path.join( output_dir, "checkpoint-{}".format(global_step) ) if args.save_eval_checkpoints: self.save_model( output_dir_current, optimizer, scheduler, model=model, results=results, ) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join( args.output_dir, "training_progress_scores.csv" ), index=False, ) if args.wandb_project or self.is_sweeping: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] self.save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results, ) if best_eval_metric and args.early_stopping_metric_minimize: if ( results[args.early_stopping_metric] - best_eval_metric < 
args.early_stopping_delta ): best_eval_metric = results[args.early_stopping_metric] self.save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results, ) early_stopping_counter = 0 else: if args.use_early_stopping: if ( early_stopping_counter < args.early_stopping_patience ): early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) else: if ( results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta ): best_eval_metric = results[args.early_stopping_metric] self.save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results, ) early_stopping_counter = 0 else: if args.use_early_stopping: if ( early_stopping_counter < args.early_stopping_patience ): early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) model.train() epoch_number += 1 output_dir_current = os.path.join( output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number) ) if args.save_model_every_epoch or args.evaluate_during_training: os.makedirs(output_dir_current, exist_ok=True) if args.save_model_every_epoch: self.save_model(output_dir_current, optimizer, scheduler, model=model) if args.evaluate_during_training and args.evaluate_each_epoch: results = self.eval_model( eval_data, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) if args.save_eval_checkpoints: self.save_model( output_dir_current, optimizer, scheduler, results=results ) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args.output_dir, "training_progress_scores.csv"), index=False, ) if args.wandb_project or self.is_sweeping: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] self.save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results, ) if best_eval_metric and args.early_stopping_metric_minimize: if ( results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta ): best_eval_metric = results[args.early_stopping_metric] self.save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results, ) early_stopping_counter = 0 else: if ( args.use_early_stopping and args.early_stopping_consider_epochs ): if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) 
logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) else: if ( results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta ): best_eval_metric = results[args.early_stopping_metric] self.save_model( args.best_model_dir, optimizer, scheduler, model=model, results=results, ) early_stopping_counter = 0 else: if ( args.use_early_stopping and args.early_stopping_consider_epochs ): if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info( f" No improvement in {args.early_stopping_metric}" ) logger.info( f" Current step: {early_stopping_counter}" ) logger.info( f" Early stopping patience: {args.early_stopping_patience}" ) else: if verbose: logger.info( f" Patience of {args.early_stopping_patience} steps reached" ) logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, )
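The long if/elif chain that maps args.scheduler strings to transformers scheduler factories can also be written as a small dispatch table. A sketch of that alternative, covering only a subset of the names used above and assuming the same string keys:

from functools import partial
from transformers import (get_constant_schedule, get_constant_schedule_with_warmup,
                          get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup)

def build_scheduler(name, optimizer, warmup_steps, t_total):
    factories = {
        "constant_schedule": partial(get_constant_schedule),
        "constant_schedule_with_warmup": partial(
            get_constant_schedule_with_warmup, num_warmup_steps=warmup_steps),
        "linear_schedule_with_warmup": partial(
            get_linear_schedule_with_warmup,
            num_warmup_steps=warmup_steps, num_training_steps=t_total),
        "cosine_schedule_with_warmup": partial(
            get_cosine_schedule_with_warmup,
            num_warmup_steps=warmup_steps, num_training_steps=t_total),
    }
    try:
        return factories[name](optimizer)
    except KeyError:
        raise ValueError("{} is not a valid scheduler.".format(name))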
def __new__(cls, optimizer, *args, **kwargs):
    return get_constant_schedule_with_warmup(optimizer, *args, **kwargs)
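Because get_constant_schedule_with_warmup already returns a fully constructed LambdaLR, overriding __new__ like this turns the class into a thin alias: instantiating it yields the scheduler object itself, and since that object is not an instance of the class, the class's own __init__ never runs. Hypothetical usage (the class name here is assumed, only the __new__ method is shown in the original):

scheduler = ConstantWarmupScheduler(optimizer, num_warmup_steps=100)
print(type(scheduler).__name__)   # LambdaLR, not ConstantWarmupScheduler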
def train( self, train_dataloader, output_dir, show_running_loss=True, eval_dataloader=None, verbose=True, **kwargs, ): """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """ device = self.device model = self.model args = self.args tb_writer = SummaryWriter(logdir=args.tensorboard_dir) t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [] custom_parameter_names = set() for group in self.args.custom_parameter_groups: params = group.pop("params") custom_parameter_names.update(params) param_group = {**group} param_group["params"] = [p for n, p in model.named_parameters() if n in params] optimizer_grouped_parameters.append(param_group) for group in self.args.custom_layer_parameters: layer_number = group.pop("layer") layer = f"layer.{layer_number}." group_d = {**group} group_nd = {**group} group_nd["weight_decay"] = 0.0 params_d = [] params_nd = [] for n, p in model.named_parameters(): if n not in custom_parameter_names and layer in n: if any(nd in n for nd in no_decay): params_nd.append(p) else: params_d.append(p) custom_parameter_names.add(n) group_d["params"] = params_d group_nd["params"] = params_nd optimizer_grouped_parameters.append(group_d) optimizer_grouped_parameters.append(group_nd) if not self.args.train_custom_parameters_only: optimizer_grouped_parameters.extend( [ { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and not any(nd in n for nd in no_decay) ], "weight_decay": args.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if n not in custom_parameter_names and any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }, ] ) warmup_steps = math.ceil(t_total * args.warmup_ratio) args.warmup_steps = warmup_steps if args.warmup_steps == 0 else args.warmup_steps if args.optimizer == "AdamW": optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) elif args.optimizer == "Adafactor": optimizer = Adafactor( optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adafactor_eps, clip_threshold=args.adafactor_clip_threshold, decay_rate=args.adafactor_decay_rate, beta1=args.adafactor_beta1, weight_decay=args.weight_decay, scale_parameter=args.adafactor_scale_parameter, relative_step=args.adafactor_relative_step, warmup_init=args.adafactor_warmup_init, ) print("Using Adafactor for T5") else: raise ValueError( "{} is not a valid optimizer class. 
Please use one of ('AdamW', 'Adafactor') instead.".format( args.optimizer ) ) if args.scheduler == "constant_schedule": scheduler = get_constant_schedule(optimizer) elif args.scheduler == "constant_schedule_with_warmup": scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps) elif args.scheduler == "linear_schedule_with_warmup": scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total ) elif args.scheduler == "cosine_schedule_with_warmup": scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, num_cycles=args.cosine_schedule_num_cycles, ) elif args.scheduler == "cosine_with_hard_restarts_schedule_with_warmup": scheduler = get_cosine_with_hard_restarts_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, num_cycles=args.cosine_schedule_num_cycles, ) elif args.scheduler == "polynomial_decay_schedule_with_warmup": scheduler = get_polynomial_decay_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total, lr_end=args.polynomial_decay_schedule_lr_end, power=args.polynomial_decay_schedule_power, ) else: raise ValueError("{} is not a valid scheduler.".format(args.scheduler)) if args.n_gpu > 1: model = torch.nn.DataParallel(model) global_step = 0 training_progress_scores = None tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.silent) epoch_number = 0 best_eval_metric = None early_stopping_counter = 0 if args.evaluate_during_training: training_progress_scores = self._create_training_progress_scores(**kwargs) if args.wandb_project: wandb.init(project=args.wandb_project, config={**asdict(args)}, **args.wandb_kwargs) wandb.watch(self.model) if args.fp16: from torch.cuda import amp scaler = amp.GradScaler() for _ in train_iterator: model.train() train_iterator.set_description(f"Epoch {epoch_number + 1} of {args.num_train_epochs}") batch_iterator = tqdm( train_dataloader, desc=f"Running Epoch {epoch_number} of {args.num_train_epochs}", disable=args.silent, mininterval=0, ) for step, batch in enumerate(batch_iterator): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, labels, mc_labels, token_type_ids = batch if args.fp16: with amp.autocast(): outputs = model( input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels, labels=labels, ) lm_loss, mc_loss = outputs[:2] # model outputs are always tuple in pytorch-transformers (see doc) loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef else: outputs = model( input_ids, token_type_ids=token_type_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels, labels=labels, ) lm_loss, mc_loss = outputs[:2] # model outputs are always tuple in pytorch-transformers (see doc) loss = lm_loss * args.lm_coef + mc_loss * args.mc_coef if args.n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu parallel training current_loss = loss.item() if show_running_loss: print("\rRunning loss: %f" % current_loss, end="") if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: scaler.scale(loss).backward() else: loss.backward() tr_loss += loss.item() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: scaler.unscale_(optimizer) if args.optimizer == "AdamW": torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) if args.fp16: 
scaler.step(optimizer) scaler.update() else: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 if args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step) tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step) logging_loss = tr_loss if args.wandb_project or self.is_sweeping: wandb.log( { "Training loss": current_loss, "lr": scheduler.get_last_lr()[0], "global_step": global_step, } ) if args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) self.save_model(output_dir_current, model=model) if args.evaluate_during_training and ( args.evaluate_during_training_steps > 0 and global_step % args.evaluate_during_training_steps == 0 ): # Only evaluate when single GPU otherwise metrics may not average well results, _, _ = self.eval_model( eval_dataloader, verbose=verbose and args.evaluate_during_training_verbose, silent=args.evaluate_during_training_silent, **kwargs, ) for key, value in results.items(): tb_writer.add_scalar("eval_{}".format(key), value, global_step) output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step)) if args.save_eval_checkpoints: self.save_model(output_dir_current, model=model, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv( os.path.join(args.output_dir, "training_progress_scores.csv"), index=False, ) if args.wandb_project or self.is_sweeping: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) if best_eval_metric and args.early_stopping_metric_minimize: if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args.early_stopping_metric}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args.early_stopping_patience}") else: if verbose: logger.info(f" Patience of {args.early_stopping_patience} steps reached") logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) else: if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) early_stopping_counter = 0 else: if args.use_early_stopping: if early_stopping_counter < args.early_stopping_patience: early_stopping_counter += 1 if verbose: logger.info(f" No improvement in {args.early_stopping_metric}") logger.info(f" Current step: {early_stopping_counter}") logger.info(f" Early stopping patience: {args.early_stopping_patience}") else: if verbose: logger.info(f" Patience of 
{args.early_stopping_patience} steps reached") logger.info(" Training terminated.") train_iterator.close() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, ) epoch_number += 1 output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number)) if args.save_model_every_epoch or args.evaluate_during_training: os.makedirs(output_dir_current, exist_ok=True) if args.save_model_every_epoch: self.save_model(output_dir_current, model=model) if args.evaluate_during_training and args.evaluate_each_epoch: results, _, _ = self.eval_model( eval_dataloader, verbose=verbose and args.evaluate_during_training_verbose, silent=True, **kwargs, ) self.save_model(output_dir_current, results=results) training_progress_scores["global_step"].append(global_step) training_progress_scores["train_loss"].append(current_loss) for key in results: training_progress_scores[key].append(results[key]) report = pd.DataFrame(training_progress_scores) report.to_csv(os.path.join(args.output_dir, "training_progress_scores.csv"), index=False) if args.wandb_project or self.is_sweeping: wandb.log(self._get_last_metrics(training_progress_scores)) if not best_eval_metric: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) if best_eval_metric and args.early_stopping_metric_minimize: if results[args.early_stopping_metric] - best_eval_metric < args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) early_stopping_counter = 0 else: if results[args.early_stopping_metric] - best_eval_metric > args.early_stopping_delta: best_eval_metric = results[args.early_stopping_metric] self.save_model(args.best_model_dir, model=model, results=results) early_stopping_counter = 0 model.train() return ( global_step, tr_loss / global_step if not self.args.evaluate_during_training else training_progress_scores, )
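Both simpletransformers train() methods above resolve warmup the same way: an explicitly configured args.warmup_steps wins, otherwise the step count is derived from warmup_ratio and the total number of updates. That resolution can be isolated as a small helper (a sketch, not library code):

import math

def resolve_warmup_steps(t_total, warmup_ratio, warmup_steps=0):
    # An explicitly configured warmup_steps takes precedence over the ratio.
    return warmup_steps if warmup_steps > 0 else math.ceil(t_total * warmup_ratio)

print(resolve_warmup_steps(t_total=10_000, warmup_ratio=0.06))   # 600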
# test_data = encode(mrpc_sets['test'])
from torch.utils.data import DataLoader
from transformers import Trainer, TrainingArguments  # model, tokenizer, train_data and valid_data come from earlier in the tutorial

train_loader = DataLoader(train_data, batch_size=16)
batch = next(iter(train_loader))
for i, batch in enumerate(train_loader):  # enumerate yields (index, batch)
    print('bloop')

trainer_args = TrainingArguments(
    output_dir='/home/ahoffman/research/transformers/examples/alex/tutorials/out',
    do_train=True,
    do_eval=True,
    evaluation_strategy="epoch",
    # do_predict=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    fp16=False)

from transformers.optimization import AdamW, get_constant_schedule_with_warmup
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=.1)
scheduler = get_constant_schedule_with_warmup(optimizer, 500)

trainer = Trainer(model=model,
                  args=trainer_args,
                  tokenizer=tokenizer,
                  train_dataset=train_data,
                  eval_dataset=valid_data,
                  optimizers=(optimizer, scheduler))
# loader = trainer.train()
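The warmup length above is hard-coded to 500 steps. If it should instead cover a fraction of training, the step count can be derived from the dataset size and the TrainingArguments before the Trainer is built. A sketch of that calculation, assuming the same train_data, trainer_args and optimizer as above, a single device, no gradient accumulation, and an illustrative warmup proportion of 10%:

import math

steps_per_epoch = math.ceil(len(train_data) / trainer_args.per_device_train_batch_size)
total_steps = steps_per_epoch * int(trainer_args.num_train_epochs)
warmup_steps = int(0.1 * total_steps)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)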
def main(args): if not args.do_eval: assert all([ x in ['true', 'false'] for x in [args.use_cuda, args.symmetric, args.linear_head, args.siamese] ]) args.use_cuda = args.use_cuda.lower() == 'true' args.symmetric = args.symmetric.lower() == 'true' args.linear_head = args.linear_head.lower() == 'true' args.siamese = args.siamese.lower() == 'true' if args.siamese: assert args.train_batch_size % 2 == 0, 'train batch size should be even in siamese mode' assert not args.symmetric if args.do_train and os.path.exists(args.output_dir): model_weights = glob(os.path.join(args.output_dir, '*.bin')) if model_weights: print(f'{model_weights}: already computed: skipping ...') return else: print( f'already existing {args.output_dir}. but without model weights ...' ) return device = torch.device("cuda" if args.use_cuda else "cpu") n_gpu = torch.cuda.device_count() if args.gradient_accumulation_steps < 1: raise ValueError( "gradient_accumulation_steps parameter should be >= 1") if args.do_train: random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if args.do_train and not os.path.exists(args.output_dir): os.makedirs(args.output_dir) os.makedirs(os.path.join(args.output_dir, 'nen-nen-weights')) elif args.do_train or args.do_validation: raise ValueError(f'{args.output_dir} already exists') suffix = datetime.now().isoformat().replace('-', '_').replace( ':', '_').split('.')[0].replace('T', '-') if args.do_train: train_writer = SummaryWriter(log_dir=os.path.join( args.output_dir, f'tensorboard-{suffix}', 'train')) dev_writer = SummaryWriter(log_dir=os.path.join( args.output_dir, f'tensorboard-{suffix}', 'dev')) test_writer = SummaryWriter(log_dir=os.path.join( args.output_dir, f'tensorboard-{suffix}', 'test')) logger.addHandler( logging.FileHandler( os.path.join(args.output_dir, f"train_logs_{suffix}.log"), 'w')) else: logger.addHandler( logging.FileHandler( os.path.join(args.ckpt_path, f"eval_logs_{suffix}.log"), 'w')) logger.info(json.dumps(vars(args), indent=4)) if args.do_train: json.dump(vars(args), open(os.path.join(args.output_dir, 'args.json'), 'w'), indent=4) logger.info("device: {}, n_gpu: {}".format(device, n_gpu)) args.train_batch_size = \ args.train_batch_size // args.gradient_accumulation_steps model_name = args.model_name data_processor = DataProcessor() train_dir = os.path.join(args.data_dir, 'train/') dev_dir = os.path.join(args.data_dir, 'dev') if args.do_train: config = configs[args.model_name] config = config.from_pretrained(args.model_name, hidden_dropout_prob=args.dropout) if args.ckpt_path != '': model_path = args.ckpt_path else: model_path = args.model_name model = models[model_name] model = model.from_pretrained( model_path, cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE), args=args, data_processor=data_processor, config=config) if args.freeze_featurizer: trainable_weights = [] for name, parameter in model.named_parameters(): # if name not in ['syn_clf.bn1.weight', 'syn_clf.bn1.bias', 'syn_clf.bn1.running_mean', 'syn_clf.bn1.running_var', 'syn_clf.dense.weight', 'syn_clf.dense.bias', 'syn_clf.out_proj.weight', 'syn_clf.out_proj.bias']: if name.startswith('roberta'): parameter.requires_grad = False else: trainable_weights.append(name) logger.info(f'trainable weights: {trainable_weights}') model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] 
optimizer_grouped_parameters = [{ 'params': [ param for name, param in param_optimizer if not any(nd in name for nd in no_decay) ], 'weight_decay': float(args.weight_decay) }, { 'params': [ param for name, param in param_optimizer if any(nd in name for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=float(args.learning_rate), eps=1e-6, betas=(0.9, 0.98), correct_bias=True) train_features = model.convert_dataset_to_features(train_dir, logger) train_dataloader = \ get_dataloader_and_tensors(train_features, args.train_batch_size, 'siamese_random' if args.siamese else 'random') train_batches_len = len(train_dataloader) num_train_optimization_steps = \ train_batches_len // args.gradient_accumulation_steps * \ args.num_train_epochs warmup_steps = int(args.warmup_proportion * num_train_optimization_steps) if args.lr_scheduler == 'linear_warmup': scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps) elif args.lr_scheduler == 'constant_warmup': scheduler = get_constant_schedule_with_warmup( optimizer, num_warmup_steps=warmup_steps) if args.fp16: from apex import amp model, optimizer = amp.initialize( model, optimizer, opt_level=args.fp16_opt_level, # loss_scale=args.loss_scale, # min_loss_scale=args.fp16_min_loss_scale, # max_loss_scale=args.fp16_max_loss_scale, ) logger.info("***** Training *****") logger.info(" Num examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size * args.gradient_accumulation_steps) logger.info(" Num steps = %d", num_train_optimization_steps) train_features = None if args.do_validation: dev_features = model.convert_dataset_to_features(dev_dir, logger) logger.info("***** Dev *****") logger.info(" Num examples = %d", len(dev_features)) logger.info(" Batch size = %d", args.eval_batch_size) dev_dataloader = \ get_dataloader_and_tensors(dev_features, args.eval_batch_size, 'sequential') test_dir = os.path.join(args.data_dir, 'test/') if os.path.exists(test_dir): test_features = model.convert_dataset_to_features( test_dir, logger) logger.info("***** Test *****") logger.info(" Num examples = %d", len(test_features)) logger.info(" Batch size = %d", args.eval_batch_size) test_dataloader = \ get_dataloader_and_tensors(test_features, args.eval_batch_size, 'sequential') best_result = defaultdict(float) eval_step = max(1, train_batches_len // args.eval_per_epoch) start_time = time.time() global_step = 0 lr = float(args.learning_rate) for epoch in range(1, 1 + args.num_train_epochs): tr_loss = 0 nb_tr_examples = 0 nb_tr_steps = 0 cur_train_loss = defaultdict(float) model.train() logger.info("Start epoch #{} (lr = {})...".format( epoch, scheduler.get_lr()[0])) train_bar = tqdm(train_dataloader, total=train_batches_len, desc='training ... ') for step, batch in enumerate(train_bar): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, token_type_ids, \ syn_labels, positions = batch train_loss, _ = model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=input_mask, input_labels={ 'syn_labels': syn_labels, 'positions': positions }) loss = train_loss['total'].mean().item() for key in train_loss: cur_train_loss[key] += train_loss[key].mean().item() train_bar.set_description( f'training... 
[epoch == {epoch} / {args.num_train_epochs}, loss == {loss}]' ) loss_to_optimize = train_loss['total'] if args.gradient_accumulation_steps > 1: loss_to_optimize = \ loss_to_optimize / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss_to_optimize, optimizer) as scaled_loss: scaled_loss.backward() else: loss_to_optimize.backward() tr_loss += loss_to_optimize.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() # optimizer.zero_grad() model.zero_grad() global_step += 1 if args.do_validation and (step + 1) % eval_step == 0: logger.info( 'Ep: {}, Stp: {}/{}, usd_t={:.2f}s, loss={:.6f}'. format(epoch, step + 1, train_batches_len, time.time() - start_time, tr_loss / nb_tr_steps)) cur_train_mean_loss = {} for key, value in cur_train_loss.items(): cur_train_mean_loss[f'{key}_loss'] = \ value / nb_tr_steps dev_predictions = os.path.join(args.output_dir, 'dev_predictions') metrics = model.predict(dev_dataloader, dev_predictions, dev_features, compute_metrics=True) metrics['global_step'] = global_step metrics['epoch'] = epoch metrics['learning_rate'] = scheduler.get_lr()[0] metrics['batch_size'] = \ args.train_batch_size * args.gradient_accumulation_steps for key, value in metrics.items(): dev_writer.add_scalar(key, value, global_step) for key, value in cur_train_mean_loss.items(): train_writer.add_scalar(key, value, global_step) scores_to_logger = tuple([ round(metrics[save_by_score] * 100.0, 2) for save_by_score in args.save_by_score.split('+') ]) logger.info( f"dev %s (lr=%s, epoch=%d): %s" % (args.save_by_score, str( scheduler.get_lr()[0]), epoch, scores_to_logger)) improved_parts = [ part for part in metrics if part.endswith('.score') and metrics[part] > args.start_save_threshold and metrics[part] > best_result[part] ] if improved_parts: best_dev_predictions = os.path.join( args.output_dir, 'best-dev-predictions') dev_predictions = os.path.join(args.output_dir, 'dev_predictions') os.makedirs(best_dev_predictions, exist_ok=True) os.makedirs(dev_predictions, exist_ok=True) for part in improved_parts: logger.info( "!!! 
                        for part in improved_parts:
                            logger.info(
                                "!!! Best dev %s (lr=%s, epoch=%d): %.2f -> %.2f" %
                                (part, str(scheduler.get_lr()[0]), epoch,
                                 best_result[part] * 100.0,
                                 metrics[part] * 100.0))
                            best_result[part] = metrics[part]
                            dev_writer.add_scalar('best_' + part,
                                                  metrics[part], global_step)
                            if [
                                    save_weight for save_weight in
                                    args.save_by_score.split('+')
                                    if save_weight == part
                            ]:
                                os.makedirs(os.path.join(args.output_dir, part),
                                            exist_ok=True)
                                output_model_file = os.path.join(
                                    args.output_dir, part, WEIGHTS_NAME)
                                save_model(args, model, output_model_file,
                                           metrics)
                            best_dev_files = [
                                file.split('/')[-1]
                                for file in glob(f'{dev_predictions}/*')
                                if part.split('.')[1] in file
                            ]
                            for dev_file in best_dev_files:
                                logger.info(
                                    f'{dev_predictions}/{dev_file} -> {best_dev_predictions}/')
                                os.system(
                                    f'cp {dev_predictions}/{dev_file} {best_dev_predictions}/')

                        if args.log_test_metrics and os.path.exists(test_dir):
                            test_predictions = os.path.join(
                                args.output_dir, 'test_predictions')
                            test_metrics = model.predict(test_dataloader,
                                                         test_predictions,
                                                         test_features,
                                                         compute_metrics=True)
                            best_test_predictions = os.path.join(
                                args.output_dir, 'best-test-predictions')
                            os.makedirs(best_test_predictions, exist_ok=True)
                            corresp_test_files = [
                                file.split('/')[-1]
                                for file in glob(f'{test_predictions}/*')
                                if any([
                                    part.split('.')[1] in file
                                    for part in improved_parts
                                ])
                            ]
                            for test_file in corresp_test_files:
                                logger.info(
                                    f'{test_predictions}/{test_file} -> {best_test_predictions}/')
                                os.system(
                                    f'cp {test_predictions}/{test_file} {best_test_predictions}/')
                            for key, value in test_metrics.items():
                                if key.endswith('.score'):
                                    test_writer.add_scalar(key, value,
                                                           global_step)
                                    if key in improved_parts:
                                        test_writer.add_scalar(
                                            'best_' + key, value, global_step)

                        if any([
                                'nen-nen.score' in part
                                for part in improved_parts
                        ]):
                            best_dev_nen_nen_path = os.path.join(
                                args.output_dir,
                                'best-dev-nen-nen-predictions')
                            os.makedirs(best_dev_nen_nen_path, exist_ok=True)
                            os.system(
                                f'mv {dev_predictions}/* {best_dev_nen_nen_path}/')
                            if args.log_test_metrics and os.path.exists(test_dir):
                                best_test_nen_nen_path = os.path.join(
                                    args.output_dir,
                                    'best-test-nen-nen-predictions')
                                os.makedirs(best_test_nen_nen_path,
                                            exist_ok=True)
                                os.system(
                                    f'mv {test_predictions}/* {best_test_nen_nen_path}/')

    if args.do_eval:
        assert args.ckpt_path != '', 'in do_eval mode ckpt_path should be specified'
        test_dir = args.eval_input_dir
        config = configs[model_name].from_pretrained(model_name)
        model = models[model_name]
        model = model.from_pretrained(args.ckpt_path, args=args,
                                      data_processor=data_processor,
                                      config=config)
        model.to(device)
        test_features = model.convert_dataset_to_features(test_dir, logger)
        logger.info("***** Test *****")
        logger.info(" Num examples = %d", len(test_features))
        logger.info(" Batch size = %d", args.eval_batch_size)
        test_dataloader = \
            get_dataloader_and_tensors(test_features, args.eval_batch_size,
                                       'sequential')
        metrics = model.predict(test_dataloader,
                                os.path.join(args.output_dir,
                                             args.eval_output_dir),
                                test_features, compute_metrics=True)
        logger.info(json.dumps(metrics, indent=4))
        with open(
                os.path.join(args.output_dir, args.eval_output_dir,
                             'metrics.txt'), 'w') as outp:
            json.dump(metrics, outp, indent=4)
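# ---------------------------------------------------------------------------
# Illustrative sketch (added by the editor, not part of either script above):
# the minimal version of the pattern both trainers share -- no-decay parameter
# grouping, warmup steps derived from a warmup proportion, constant vs. linear
# warmup schedules, and the accumulate -> clip -> step -> scheduler.step()
# order. All names here (build_optimizer_and_scheduler, toy_model,
# warmup_proportion, accum_steps, ...) are hypothetical; only the torch and
# transformers calls are real APIs.
# ---------------------------------------------------------------------------
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import (get_constant_schedule_with_warmup,
                          get_linear_schedule_with_warmup)


def build_optimizer_and_scheduler(model, loader, *, epochs=3, accum_steps=2,
                                  lr=2e-5, weight_decay=0.01,
                                  warmup_proportion=0.1,
                                  lr_scheduler='constant_warmup'):
    # Bias and LayerNorm weights are conventionally excluded from weight decay.
    no_decay = ('bias', 'LayerNorm.weight')
    grouped = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    optimizer = torch.optim.AdamW(grouped, lr=lr, eps=1e-6, betas=(0.9, 0.98))
    # Warmup is counted in optimizer updates, not raw batches.
    total_updates = len(loader) // accum_steps * epochs
    warmup_steps = int(warmup_proportion * total_updates)
    if lr_scheduler == 'linear_warmup':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps,
            num_training_steps=total_updates)
    else:
        scheduler = get_constant_schedule_with_warmup(
            optimizer, num_warmup_steps=warmup_steps)
    return optimizer, scheduler


if __name__ == '__main__':
    # Usage example on a toy model and random data.
    toy_model = torch.nn.Linear(8, 2)
    xs, ys = torch.randn(64, 8), torch.randint(0, 2, (64,))
    loader = DataLoader(TensorDataset(xs, ys), batch_size=8)
    optimizer, scheduler = build_optimizer_and_scheduler(toy_model, loader)
    accum_steps = 2
    for epoch in range(3):
        for step, (x, y) in enumerate(loader):
            loss = torch.nn.functional.cross_entropy(toy_model(x), y)
            (loss / accum_steps).backward()  # average over the virtual batch
            if (step + 1) % accum_steps == 0:
                torch.nn.utils.clip_grad_norm_(toy_model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()  # advance the warmup once per optimizer update
                optimizer.zero_grad()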
def main(args):
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    args.train_batch_size = \
        args.train_batch_size // args.gradient_accumulation_steps

    if args.do_train:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if n_gpu > 0:
            torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    if args.do_train:
        suffix = datetime.now().isoformat().replace('-', '_').replace(
            ':', '_').split('.')[0].replace('T', '-')
        logger.addHandler(
            logging.FileHandler(
                os.path.join(args.output_dir, f"train_{suffix}.log"), 'w'))
        eval_logger.addHandler(
            logging.FileHandler(
                os.path.join(args.output_dir, f"scores_{suffix}.log"), 'w'))
    else:
        logger.addHandler(
            logging.FileHandler(os.path.join(args.output_dir, "eval.log"),
                                'w'))
    logger.info(args)
    logger.info("device: {}, n_gpu: {}".format(device, n_gpu))

    processor = DataProcessor(tag_format=args.tag_format,
                              filter_non_causal=args.only_task_2
                              or args.only_bert_ner)
    if args.only_task_2 or args.only_bert_ner:
        model_name = f'{args.model}-fake'
        assert args.text_clf_weight == 0.0, \
            f"Training only on task 2 requires to set " \
            f"text_clf_weight to zero. {args.text_clf_weight} passed."
        assert args.eval_metric.startswith('sequence'), \
            f"Training only on task 2 requires to set task 2 related " \
            f"metric. {args.eval_metric} passed."
    else:
        model_name = args.model

    text_labels_list = processor.get_text_labels(args.data_dir, logger)
    sequence_labels_list = processor.get_sequence_labels(args.data_dir, logger)
    label2id = {
        'text': {label: i for i, label in enumerate(text_labels_list)},
        'sequence':
        {label: i for i, label in enumerate(sequence_labels_list, 1)}
    }
    id2label = {
        'text': {i: label for i, label in enumerate(text_labels_list)},
        'sequence':
        {i: label for i, label in enumerate(sequence_labels_list, 1)}
    }
    num_text_labels = len(text_labels_list)
    num_sequence_labels = len(sequence_labels_list) + 1

    # do_lower_case = 'uncased' in args.model
    do_lower_case = True
    tokenizer = tokenizers[args.model].from_pretrained(
        args.model, do_lower_case=do_lower_case)

    if args.do_train:
        config = configs[args.model]
        config = config.from_pretrained(args.model,
                                        hidden_dropout_prob=args.dropout)
        model = models[model_name].from_pretrained(
            args.model,
            cache_dir=str(PYTORCH_PRETRAINED_BERT_CACHE),
            num_text_labels=num_text_labels,
            num_sequence_labels=num_sequence_labels,
            sequence_clf_weight=args.sequence_clf_weight,
            text_clf_weight=args.text_clf_weight,
            pooling_type=args.bert_ner_pool_type,
            config=config)
        print("text and sequence tasks weights:", model.text_clf_weight,
              model.sequence_clf_weight)
    else:
        model = models[model_name].from_pretrained(
            args.output_dir,
            num_sequence_labels=num_sequence_labels,
            num_text_labels=num_text_labels,
            text_clf_weight=args.text_clf_weight,
            sequence_clf_weight=args.sequence_clf_weight,
            pooling_type=args.bert_ner_pool_type)
    model.to(device)
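    # Note (added): in do_train mode the model above is initialized from the
    # pretrained checkpoint, while in eval-only mode it is loaded from
    # args.output_dir; the dev split below is featurized once and reused at
    # every eval_step during training.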
    eval_examples = processor.get_dev_examples(args.data_dir)
    eval_features = model.convert_examples_to_features(eval_examples, label2id,
                                                       args.max_seq_length,
                                                       tokenizer, logger)
    logger.info("***** Dev *****")
    logger.info(" Num examples = %d", len(eval_examples))
    logger.info(" Batch size = %d", args.eval_batch_size)
    eval_dataloader, eval_text_labels_ids, eval_sequence_labels_ids = \
        get_dataloader_and_text_ids_with_sequence_ids(eval_features,
                                                      args.eval_batch_size)

    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        train_features = model.convert_examples_to_features(
            train_examples, label2id, args.max_seq_length, tokenizer, logger)
        if args.train_mode == 'sorted' or args.train_mode == 'random_sorted':
            train_features = sorted(train_features,
                                    key=lambda f: np.sum(f.input_mask))
        else:
            random.shuffle(train_features)
        train_dataloader, _, _ = \
            get_dataloader_and_text_ids_with_sequence_ids(train_features,
                                                          args.train_batch_size)
        train_batches = [batch for batch in train_dataloader]

        num_train_optimization_steps = \
            len(train_dataloader) // args.gradient_accumulation_steps * \
            args.num_train_epochs
        warmup_steps = int(args.warmup_proportion *
                           num_train_optimization_steps)
        logger.info("***** Training *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        best_result = None
        eval_step = max(1, len(train_batches) // args.eval_per_epoch)
        lr = float(args.learning_rate)

        if n_gpu > 1:
            model = torch.nn.DataParallel(model)
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                param for name, param in param_optimizer
                if not any(nd in name for nd in no_decay)
            ],
            'weight_decay': float(args.weight_decay)
        }, {
            'params': [
                param for name, param in param_optimizer
                if any(nd in name for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
        if args.lr_schedule == 'constant_warmup':
            print('lr schedule = constant_warmup')
            scheduler = get_constant_schedule_with_warmup(
                optimizer, num_warmup_steps=warmup_steps)
        else:
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=warmup_steps,
                num_training_steps=num_train_optimization_steps)
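        # Note (added): with 'constant_warmup' the learning rate ramps up over
        # warmup_steps optimizer updates and then stays at args.learning_rate;
        # the linear schedule instead decays it to zero by
        # num_train_optimization_steps.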
        start_time = time.time()
        global_step = 0
        tr_loss = 0
        nb_tr_examples = 0
        nb_tr_steps = 0
        for epoch in range(1, 1 + int(args.num_train_epochs)):
            model.train()
            logger.info("Start epoch #{} (lr = {})...".format(epoch, lr))
            if args.train_mode == 'random' or args.train_mode == 'random_sorted':
                random.shuffle(train_batches)
            for step, batch in enumerate(
                    tqdm(train_batches, total=len(train_batches),
                         desc='fitting ... ')):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, text_labels_ids, \
                    sequence_labels_ids, token_pos_ids = batch
                loss = model(input_ids=input_ids,
                             token_type_ids=segment_ids,
                             attention_mask=input_mask,
                             text_labels=text_labels_ids,
                             sequence_labels=sequence_labels_ids,
                             token_pos_ids=token_pos_ids)
                if n_gpu > 1:
                    loss = loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1
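                    # Note (added): optimizer.step() and scheduler.step() above
                    # run only once per gradient_accumulation_steps batches, so
                    # warmup_steps and num_train_optimization_steps are counted
                    # in optimizer updates rather than raw batches.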
                    if args.do_validate and (step + 1) % eval_step == 0:
                        logger.info(
                            'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}'
                            .format(epoch, step + 1, len(train_batches),
                                    time.time() - start_time,
                                    tr_loss / nb_tr_steps))
                        save_model = False
                        preds, result, scores = evaluate(
                            model, device, eval_dataloader,
                            eval_text_labels_ids, eval_sequence_labels_ids,
                            num_text_labels, num_sequence_labels, label2id)
                        model.train()
                        result['global_step'] = global_step
                        result['epoch'] = epoch
                        result['learning_rate'] = lr
                        result['batch_size'] = args.train_batch_size

                        if not args.only_task_2 and not args.only_bert_ner:
                            logger.info("First 20 predictions:")
                            for text_pred, text_label in zip(
                                    preds['text'][:20],
                                    eval_text_labels_ids.numpy()[:20]):
                                sign = u'\u2713' if text_pred == text_label else u'\u2718'
                                logger.info("pred = %s, label = %s %s" %
                                            (id2label['text'][text_pred],
                                             id2label['text'][text_label],
                                             sign))

                        if (best_result is None) or \
                                (result[args.eval_metric] > best_result[args.eval_metric]):
                            best_result = result
                            save_model = True
                            logger.info(
                                "!!! Best dev %s (lr=%s, epoch=%d): %.2f" %
                                (args.eval_metric, str(lr), epoch,
                                 result[args.eval_metric] * 100.0))

                        if save_model:
                            model_to_save = model.module if hasattr(
                                model, 'module') else model
                            output_model_file = os.path.join(
                                args.output_dir, WEIGHTS_NAME)
                            output_config_file = os.path.join(
                                args.output_dir, CONFIG_NAME)
                            torch.save(model_to_save.state_dict(),
                                       output_model_file)
                            model_to_save.config.to_json_file(output_config_file)
                            tokenizer.save_vocabulary(args.output_dir)
                            if best_result:
                                output_eval_file = os.path.join(
                                    args.output_dir, "eval_results.txt")
                                with open(output_eval_file, "w") as writer:
                                    for key in sorted(result.keys()):
                                        writer.write("%s = %s\n" %
                                                     (key, str(result[key])))

    if args.do_eval:
        test_file = os.path.join(
            args.data_dir,
            'test.json') if args.test_file == '' else args.test_file
        eval_examples = processor.get_test_examples(test_file)
        eval_features = model.convert_examples_to_features(
            eval_examples, label2id, args.max_seq_length, tokenizer, logger)
        logger.info("***** Test *****")
        logger.info(" Num examples = %d", len(eval_examples))
        logger.info(" Batch size = %d", args.eval_batch_size)
        eval_dataloader, eval_text_labels_ids, eval_sequence_labels_ids = \
            get_dataloader_and_text_ids_with_sequence_ids(eval_features,
                                                          args.eval_batch_size)
        preds, result, scores = evaluate(model, device, eval_dataloader,
                                         eval_text_labels_ids,
                                         eval_sequence_labels_ids,
                                         num_text_labels, num_sequence_labels,
                                         label2id, compute_scores=False)
        aggregated_results = {}
        task = "sequence"
        eval_orig_positions_map = [
            ex.orig_positions_map for ex in eval_features
        ]
        aggregated_results[task] = [
            list(pred[orig_positions]) + [label2id[task]['0']] *
            (len(ex.tokens) - len(orig_positions))
            for pred, orig_positions, ex in zip(
                preds[task], eval_orig_positions_map, eval_examples)
        ]
        aggregated_results[f'{task}_scores'] = [
            list(score[orig_positions]) + [0.999] *
            (len(ex.tokens) - len(orig_positions))
            for score, orig_positions, ex in zip(
                scores[task], eval_orig_positions_map, eval_examples)
        ]
        prediction_results = {
            'idx': [ex.guid for ex in eval_examples],
            'tokens': [' '.join(ex.tokens) for ex in eval_examples],
            'sequence_labels':
            [' '.join(ex.sequence_labels) for ex in eval_examples],
            'text_label': [ex.text_label for ex in eval_examples],
            'text_pred': [id2label['text'][x] for x in preds['text']],
            'sequence_pred': [
                ' '.join([
                    id2label['sequence'][x] if x != 0 else '0' for x in sent
                ]) for sent in aggregated_results['sequence']
            ],
            'sequence_scores': [
                ' '.join([str(score) for score in sent])
                for sent in aggregated_results['sequence_scores']
            ],
            'task_id': [ex.task_id for ex in eval_examples],
            'text': [ex.text for ex in eval_examples]
        }
        prediction_results = pd.DataFrame(prediction_results)
        prediction_results.to_csv(os.path.join(
            args.output_dir,
            f"{args.test_file.split('/')[-1]}_predictions.tsv"),
                                  sep='\t', index=False)
        with open(
                os.path.join(
                    args.output_dir,
                    f"{args.test_file.split('/')[-1]}_eval_results.txt"),
                "w") as f:
            for key in sorted(result.keys()):
                f.write("%s = %s\n" % (key, str(result[key])))
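# ---------------------------------------------------------------------------
# Illustrative sketch (added by the editor, not part of the scripts above):
# a quick way to inspect what get_constant_schedule_with_warmup actually does
# to the learning rate. The names (probe_param, base_lr, ...) are
# hypothetical; only the torch and transformers calls are real APIs.
# ---------------------------------------------------------------------------
import torch
from transformers import get_constant_schedule_with_warmup

base_lr = 3e-5
probe_param = torch.nn.Parameter(torch.zeros(1))
probe_optimizer = torch.optim.AdamW([probe_param], lr=base_lr)
probe_scheduler = get_constant_schedule_with_warmup(probe_optimizer,
                                                    num_warmup_steps=4)

for update in range(8):
    probe_optimizer.step()  # normally preceded by a backward pass
    probe_scheduler.step()
    # The LR rises linearly to base_lr over the 4 warmup updates, then stays
    # flat: 0.25 * base_lr, 0.5 * base_lr, 0.75 * base_lr, base_lr, base_lr, ...
    print(update, probe_scheduler.get_last_lr()[0])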