def _validate():
    """Run validation (when a validation split exists) and checkpoint.

    With ``df_valid``: logs metrics and saves to ``model_path`` whenever the
    validation kappa improves. Without it: the main process saves the current
    weights with default bins so a checkpoint always exists.
    """
    nonlocal best_kappa
    if df_valid is not None:
        valid_metrics, bins, _ = validate()
        if is_main:
            epoch_pbar.set_postfix(
                {k: f'{v:.4f}' for k, v in valid_metrics.items()})
            json_log_plots.write_event(run_root, step, **valid_metrics)
        # NOTE(review): this save is not gated by is_main, so under
        # multi-process training every rank writes the same checkpoint path —
        # presumably harmless, but confirm against the launcher.
        if valid_metrics['kappa'] > best_kappa:
            best_kappa = valid_metrics['kappa']
            state = {
                'weights': model.state_dict(),
                'bins': bins,
                'metrics': valid_metrics,
                'params': params,
            }
            torch.save(state, model_path)
    elif is_main:
        # No validation split: save latest weights with default bins.
        state = {
            'weights': model.state_dict(),
            'bins': default_bins(N_CLASSES),
            'params': params,
        }
        torch.save(state, model_path)
def validate(epoch: int):
    """Compute validation loss and log it; only the main process of a
    single-process run does any work."""
    if not is_main or world_size != 1:
        return
    loss = get_valid_loss()
    perplexity = np.exp(loss)
    json_log_plots.write_event(run_path, step=seen_tokens, valid_loss=loss)
    log_writer_valid.add_scalar("loss_epoch", loss, epoch)
    log_writer_valid.add_scalar("perplexity_epoch", perplexity, epoch)
def train(): nonlocal seen_tokens epoch_size = len(train_dataset) // step_tokens * step_tokens pbar = tqdm.trange(epochs, desc='epochs', dynamic_ncols=True, disable=not is_main) # pbar used for epochs # init_epoch_pbar = lambda: tqdm.trange( # epoch_size, dynamic_ncols=True, disable=not is_main) # init_epoch_pbar = lambda: tqdm.trange(epoch_size, disable=not is_main) # epoch_pbar = init_epoch_pbar() # # pbar.update(seen_tokens // epoch_size) # # pbar.refresh() # epoch_pbar.update(seen_tokens % epoch_size) step = 1 loss_per_epoch = [] j = 0 start_time = time.time() while seen_tokens < epochs * epoch_size: if max_tokens and seen_tokens >= max_tokens: print(f'max_tokens {max_tokens} reached, ' f'saving and exiting') save() validate() return train_step() seen_tokens += step_tokens step += 1 # epoch_pbar.update(step_tokens) # epoch_pbar.set_description(f'epoch {1 + seen_tokens // epoch_size}') # epoch_pbar.set_postfix(loss=f'{loss_meter.mean():.2f}') # epoch_pbar.refresh() loss_per_epoch.append(loss_meter.mean()) if step % save_every == 0: save() if is_main and step % log_every == 0: json_log_plots.write_event(dist_run_path, step=seen_tokens, loss=loss_meter.mean()) loss_meter.reset() if step % validate_every == 0: validate() # create a new progress bar for the next epoch if seen_tokens % epoch_size == 0: # pbar.update() # epoch_pbar.close() # epoch_pbar = init_epoch_pbar() valid_loss = get_valid_loss() print(f'epoch: {j} \t train_loss: {np.mean(loss_per_epoch):.3f} \t valid_loss = {valid_loss:.3f} \t time: {(time.time()-start_time):.2f}') j += 1 loss_per_epoch = [] start_time = time.time() # end of training save() validate()
def log_training_loss(_): nonlocal step train_losses.append(trainer.state.output) smoothed_loss = np.mean(train_losses) epoch_pbar.set_postfix(loss=f'{smoothed_loss:.4f}') epoch_pbar.update(1) step += 1 if step % 20 == 0 and output_dir: json_log_plots.write_event(output_dir, step=step * args.batch_size, loss=smoothed_loss)
def log_validation_results(_): nonlocal best_f1 metrics = evaluate() if output_dir: json_log_plots.write_event( output_dir, step=step * args.batch_size, **metrics) if metrics['f1'] > best_f1: best_f1 = metrics['f1'] if output_dir: torch.save(model.state_dict(), output_dir / 'model_best.pth') epochs_pbar.set_postfix({ k: format_value(v) for k, v in metrics.items()})
def train():
    """Training loop with nested epoch/step progress bars.

    Supports resuming mid-epoch: both bars are fast-forwarded from
    ``seen_tokens`` before the loop starts. Multiple named loss meters are
    logged together and reset every ``log_every`` steps.
    """
    nonlocal seen_tokens
    # Round the epoch down to a whole number of optimizer steps.
    epoch_size = len(train_dataset) // step_tokens * step_tokens
    pbar = tqdm.trange(epochs, desc='epochs', dynamic_ncols=True,
                       disable=not is_main)
    init_epoch_pbar = lambda: tqdm.trange(
        epoch_size, dynamic_ncols=True, disable=not is_main)
    epoch_pbar = init_epoch_pbar()
    # Fast-forward both bars when resuming from a checkpoint.
    pbar.update(seen_tokens // epoch_size)
    pbar.refresh()
    epoch_pbar.update(seen_tokens % epoch_size)
    step = 1
    while seen_tokens < epochs * epoch_size:
        if max_tokens and seen_tokens >= max_tokens:
            print(f'max_tokens {max_tokens} reached, '
                  f'saving and exiting')
            save()
            validate()
            return
        train_step()
        seen_tokens += step_tokens
        step += 1
        epoch_pbar.update(step_tokens)
        epoch_pbar.set_description(
            f'epoch {1 + seen_tokens // epoch_size}')
        epoch_pbar.set_postfix(loss=f'{loss_meters["loss"].mean():.2f}')
        epoch_pbar.refresh()
        if step % save_every == 0:
            save()
        if is_main and step % log_every == 0:
            # Log the running mean of every tracked loss, then reset meters.
            json_log_plots.write_event(
                run_path, step=seen_tokens, **{
                    name: meter.mean()
                    for name, meter in loss_meters.items()
                })
            for meter in loss_meters.values():
                meter.reset()
        if step % validate_every == 0:
            validate()
        if seen_tokens % epoch_size == 0:
            # Epoch boundary: advance the outer bar, restart the inner bar.
            pbar.update()
            epoch_pbar.close()
            epoch_pbar = init_epoch_pbar()
    # end of training
    save()
    validate()
def train():
    """Epoch-based training loop with per-step logging, optional mid-epoch
    validation, and a save + validation at the end of each epoch."""
    nonlocal step
    for _ in tqdm.trange(params['epochs'], desc='epoch', dynamic_ncols=True):
        # NOTE(review): scheduler is stepped at epoch *start* (the pre-1.1
        # PyTorch convention) — confirm this is intended for the scheduler
        # in use.
        lr_scheduler.step()
        pbar = tqdm.tqdm(train_loader, desc='train', dynamic_ncols=True)
        for batch in pbar:
            loss_value = train_step(*batch)
            step += 1
            pbar.set_postfix(loss=f'{loss_value:.2f}')
            # Log against number of examples seen, not raw step count.
            json_log_plots.write_event(run_path, step * params['batch_size'],
                                       loss=loss_value)
            if (params['validate_every']
                    and step % params['validate_every'] == 0):
                validate()
        save()
        validate()
def train_epoch(epoch):
    """Train for one epoch with mixed precision and gradient accumulation.

    Logs a running mean loss every ``report_freq`` iterations; steps the
    optimizer only every ``args.grad_acc`` micro-batches.
    """
    nonlocal step
    model.train()
    report_freq = 5  # iterations between progress-bar/log updates
    running_losses = []
    train_loader = make_loader(df_train, args.batch_size, training=True)
    if args.ddp:
        # Re-shuffle data shards per epoch under distributed training.
        train_loader.sampler.set_epoch(epoch)
    pbar = tqdm.tqdm(train_loader, dynamic_ncols=True, desc='train',
                     disable=not is_main)
    optimizer.zero_grad()
    for i, (ids, xs, ys) in enumerate(pbar):
        # Global example counter across all devices.
        step += len(ids) * n_devices
        with amp.autocast(enabled=amp_enabled):
            _, loss = forward(xs, ys)
        scaler.scale(loss).backward()
        if (i + 1) % args.grad_acc == 0:
            # Optimizer step only every grad_acc micro-batches.
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
        running_losses.append(float(loss))
        if lr_scheduler_per_step:
            try:
                lr_scheduler.step()
            except ValueError as e:
                # presumably the scheduler stepped past its horizon —
                # best-effort, just report and continue; confirm with the
                # scheduler in use.
                print(e)
        if i and i % report_freq == 0:
            mean_loss = np.mean(running_losses)
            running_losses.clear()
            pbar.set_postfix({'loss': f'{mean_loss:.4f}'})
            json_log_plots.write_event(run_root, step, loss=mean_loss)
    pbar.close()
    if lr_scheduler is not None and not lr_scheduler_per_step:
        lr_scheduler.step()
def train():
    # Turn on training mode which enables dropout.
    """Transformer-XL-style training over ``max_step`` batches.

    Threads recurrence memory (``mems``) between consecutive batches,
    applies linear warmup and step-wise LR scheduling, and periodically
    logs, evaluates and checkpoints (keeping the best-valid-loss model).
    """
    global train_loss, best_val_loss, eval_start_time, log_start_time, corpus, n_batches_per_epoch, max_step
    model.train()
    mems = tuple()  # recurrence memory, carried across steps
    sample_from_batch = 0
    for train_step in range(n_restart_step, max_step):
        data, target = corpus.get_batch('train', train_step)
        model.zero_grad()
        ret = para_model(data, target, *mems)
        loss, mems = ret[0], ret[1:]
        loss = loss.float().mean().type_as(loss)
        loss.backward()
        train_loss += loss.float().item()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        if args.sample_softmax > 0:
            optimizer_sparse.step()
        # step-wise learning rate annealing
        if args.scheduler in ['cosine', 'constant', 'dev_perf']:
            # linear warmup stage
            if train_step < args.warmup_step:
                curr_lr = args.lr * train_step / args.warmup_step
                optimizer.param_groups[0]['lr'] = curr_lr
                if args.sample_softmax > 0:
                    optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2
            else:
                if args.scheduler == 'cosine':
                    scheduler.step(train_step)
                    if args.sample_softmax > 0:
                        scheduler_sparse.step(train_step)
        elif args.scheduler == 'inv_sqrt':
            scheduler.step(train_step)
        if (train_step > n_restart_step) and (train_step % args.log_interval == 0):
            cur_loss = train_loss / args.log_interval
            elapsed = time.time() - log_start_time
            epoch = train_step / n_batches_per_epoch + 1
            log_str = '| epoch %2d/%2d batch %6d/%6d [%7.2f%%] | lr %.3g | ms/batch %5.2f | loss %7.3f' % (
                epoch, args.num_epochs, train_step % n_batches_per_epoch,
                n_batches_per_epoch, train_step * 100.0 / max_step,
                optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss)
            log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss))
            logging(log_str)
            # lr is scaled up so it is visible on the same plot as the loss.
            json_log_plots.write_event(
                Path(args.work_dir), step=train_step, loss=cur_loss,
                lr=optimizer.param_groups[0]['lr'] * 100000.0)
            train_loss = 0
            log_start_time = time.time()
            sample_from_batch = random.randint(0, args.batch_size - 1)
        if (train_step > n_restart_step) and (train_step % args.eval_interval == 0):
            save_model('cur')
            # Persist the step count so a restart can resume from here.
            with open(n_steps_txt_path, 'w') as f:
                f.write("%s\n" % train_step)
            logging("evaluating model...")
            val_loss = evaluate('valid')
            json_log_plots.write_event(Path(args.work_dir), step=train_step,
                                       val_loss=val_loss)
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                logging("best valid loss so far.")
                save_model('valid')
                best_val_loss = val_loss
def validate():
    """Log validation loss; only the main process of a single-process run
    does any work."""
    if world_size != 1 or not is_main:
        return
    valid_loss = get_valid_loss()
    json_log_plots.write_event(run_path, step=seen_tokens,
                               valid_loss=valid_loss)
def main():
    """Train/evaluate a detection model, optionally distributed.

    Modes: default (train + per-epoch eval, checkpointing and best-f1
    tracking), ``--test-only`` (evaluate and dump CSVs), ``--submission``
    (predict on the test set).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    arg = parser.add_argument
    arg('--model', default='fasterrcnn_resnet50_fpn', help='model')
    arg('--device', default='cuda', help='device')
    arg('--batch-size', default=16, type=int)
    arg('--workers', default=4, type=int,
        help='number of data loading workers')
    arg('--lr', default=0.01, type=float, help='initial learning rate')
    arg('--momentum', default=0.9, type=float, help='momentum')
    arg('--wd', '--weight-decay', default=1e-4, type=float,
        help='weight decay (default: 1e-4)', dest='weight_decay')
    arg('--epochs', default=45, type=int,
        help='number of total epochs to run')
    arg('--lr-steps', default=[35], nargs='+', type=int,
        help='decrease lr every step-size epochs')
    arg('--lr-gamma', default=0.1, type=float,
        help='decrease lr by a factor of lr-gamma')
    arg('--cosine', type=int, default=0,
        help='cosine lr schedule (disabled step lr schedule)')
    arg('--print-freq', default=100, type=int, help='print frequency')
    arg('--output-dir', help='path where to save')
    arg('--resume', help='resume from checkpoint')
    arg('--test-only', help='Only test the model', action='store_true')
    arg('--submission', help='Create test predictions', action='store_true')
    arg('--pretrained', type=int, default=0,
        help='Use pre-trained models from the modelzoo')
    arg('--score-threshold', type=float, default=0.5)
    arg('--nms-threshold', type=float, default=0.25)
    arg('--repeat-train-step', type=int, default=2)
    # fold parameters
    arg('--fold', type=int, default=0)
    arg('--n-folds', type=int, default=5)
    # distributed training parameters
    arg('--world-size', default=1, type=int,
        help='number of distributed processes')
    arg('--dist-url', default='env://',
        help='url used to set up distributed training')
    args = parser.parse_args()
    if args.test_only and args.submission:
        parser.error('pass one of --test-only and --submission')
    output_dir = Path(args.output_dir) if args.output_dir else None
    if output_dir:
        output_dir.mkdir(parents=True, exist_ok=True)
    utils.init_distributed_mode(args)
    print(args)
    device = torch.device(args.device)

    # Data loading code
    print('Loading data')
    df_train, df_valid = load_train_valid_df(args.fold, args.n_folds)
    root = TRAIN_ROOT
    if args.submission:
        # Submission mode predicts on the test set instead of the valid fold.
        df_valid = pd.read_csv(DATA_ROOT / 'sample_submission.csv')
        df_valid['labels'] = ''
        root = TEST_ROOT
    dataset = Dataset(df_train, get_transform(train=True), root,
                      skip_empty=False)
    dataset_test = Dataset(df_valid, get_transform(train=False), root,
                           skip_empty=False)

    print('Creating data loaders')
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            dataset)
        test_sampler = \
            torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)
    train_batch_sampler = torch.utils.data.BatchSampler(
        train_sampler, args.batch_size, drop_last=True)
    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler,
        num_workers=args.workers, collate_fn=utils.collate_fn)
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1, sampler=test_sampler,
        num_workers=args.workers, collate_fn=utils.collate_fn)

    print('Creating model')
    model = build_model(args.model, args.pretrained, args.nms_threshold)
    model.to(device)
    # Keep a handle to the raw model for state_dict save/load under DDP.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum,
                                weight_decay=args.weight_decay)
    lr_scheduler = None
    if args.cosine:
        lr_scheduler = CosineAnnealingLR(optimizer, args.epochs)
    elif args.lr_steps:
        lr_scheduler = MultiStepLR(optimizer, milestones=args.lr_steps,
                                   gamma=args.lr_gamma)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        if 'model' in checkpoint:
            # Full training checkpoint: restore model, optimizer, scheduler.
            model_without_ddp.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            if lr_scheduler and 'lr_scheduler' in checkpoint:
                lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        else:
            # Bare state-dict checkpoint (weights only).
            model_without_ddp.load_state_dict(checkpoint)
        print(f'Loaded from checkpoint {args.resume}')

    def save_eval_results(er):
        # Persist per-image scores and classification ground truth as CSV.
        scores, clf_gt = er
        if output_dir:
            pd.DataFrame(scores).to_csv(output_dir / 'eval.csv', index=None)
            pd.DataFrame(clf_gt).to_csv(output_dir / 'clf_gt.csv',
                                        index=None)

    if args.test_only or args.submission:
        _, eval_results = evaluate(model, data_loader_test, device=device,
                                   output_dir=output_dir,
                                   threshold=args.score_threshold)
        if args.test_only:
            save_eval_results(eval_results)
        elif output_dir:
            pd.DataFrame(eval_results[1]).to_csv(
                output_dir / 'test_predictions.csv', index=None)
        return

    print('Start training')
    best_f1 = 0
    start_time = time.time()
    for epoch in range(args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        # Optionally run several passes over the data per "epoch".
        for _ in range(args.repeat_train_step):
            train_metrics = train_one_epoch(model, optimizer, data_loader,
                                            device, epoch, args.print_freq)
        if lr_scheduler:
            lr_scheduler.step()
        if output_dir:
            json_log_plots.write_event(output_dir, step=epoch,
                                       **train_metrics)
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': (lr_scheduler.state_dict()
                                     if lr_scheduler else None),
                    'args': args
                }, output_dir / 'checkpoint.pth')
        # evaluate after every epoch
        eval_metrics, eval_results = evaluate(
            model, data_loader_test, device=device, output_dir=None,
            threshold=args.score_threshold)
        save_eval_results(eval_results)
        if output_dir:
            json_log_plots.write_event(output_dir, step=epoch,
                                       **eval_metrics)
            if eval_metrics['f1'] > best_f1:
                best_f1 = eval_metrics['f1']
                print(f'Updated best model with f1 of {best_f1}')
                utils.save_on_master(model_without_ddp.state_dict(),
                                     output_dir / 'model_best.pth')

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main():
    """Pre-train/fine-tune BERT on a raw corpus.

    Supports optional fp16 (apex), gradient accumulation, and distributed
    training via ``--local_rank``; checkpoints periodically, on Ctrl+C,
    and at the end of training.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument(
        "--on_memory",
        action='store_true',
        help="Whether to load train samples into memory or use disk")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    # Device / distributed setup.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # Per-forward-pass batch size; the effective batch stays as requested.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError(
            "Training is currently the only implemented execution option. Please set `do_train`."
        )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(
        args.bert_model, do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = BERTDataset(args.train_corpus,
                                    tokenizer,
                                    seq_len=args.max_seq_length,
                                    corpus_lines=None,
                                    on_memory=args.on_memory)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer: no weight decay on biases and LayerNorm params.
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer,
                                           dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(
                    optimizer, static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    if not args.do_train:
        return

    def save():
        # Save a trained model
        logger.info("** ** * Saving fine - tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir,
                                         "pytorch_model.bin")
        torch.save(model_to_save.state_dict(), output_model_file)

    global_step = 0
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Batch size = %d", args.train_batch_size)
    logger.info(" Num steps = %d", num_train_optimization_steps)
    if args.local_rank == -1:
        train_sampler = RandomSampler(train_dataset)
    else:
        #TODO: check if this works with current data generator from disk that relies on next(file)
        # (it doesn't return item back by index)
        train_sampler = DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  num_workers=2)

    model.train()
    nb_tr_examples, nb_tr_steps = 0, 0
    try:
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            # Loss shown/logged is a moving average over the last 20 steps.
            tr_losses = deque(maxlen=20)
            pbar = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(pbar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
                loss = model(input_ids, segment_ids, input_mask,
                             lm_label_ids, is_next)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                tr_losses.append(loss.item())
                pbar.set_postfix(loss=f'{np.mean(tr_losses):.4f}')
                if (step + 1) % 20 == 0:
                    json_log_plots.write_event(Path(args.output_dir),
                                               nb_tr_examples,
                                               loss=np.mean(tr_losses))
                if (step + 1) % 10000 == 0:
                    save()
    except KeyboardInterrupt:
        print('Ctrl+C pressed, saving checkpoint')
        save()
        raise
    save()
def main():
    """Train/validate/export a BERT or GPT-2 sequence classifier, or build a
    test submission, depending on CLI flags.

    Default mode trains, checkpointing at a yield interval and keeping the
    best-AUC weights; ``--validation``, ``--submission`` and ``--export``
    are run-once modes that return early.
    """
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('run_root')
    arg('--train-size', type=int)
    arg('--valid-size', type=int)
    arg('--test-size', type=int)
    arg('--model', default='bert-base-uncased')
    arg('--train-seq-length', type=int, default=224)
    arg('--test-seq-length', type=int, default=296)
    arg('--epochs', type=int, default=2)
    arg('--validation', action='store_true')
    arg('--submission', action='store_true')
    arg('--lr', type=float, default=2e-5)
    arg('--batch-size', type=int, default=32)
    arg('--accumulation-steps', type=int, default=2)
    arg('--checkpoint-interval', type=int)
    arg('--clean', action='store_true')
    arg('--fold', type=int, default=0)
    arg('--bucket', type=int, default=1)
    arg('--load-weights', help='load weights for training')
    arg('--export', help='export everything for inference')
    args = parser.parse_args()

    run_root = Path(args.run_root)
    do_train = not (args.submission or args.validation or args.export)
    if do_train:
        # Fresh training run: optionally wipe the run dir, then record the
        # parameters and a copy of this script for reproducibility.
        if args.clean and run_root.exists():
            if input(f'Clean "{run_root.absolute()}"? ') == 'y':
                shutil.rmtree(run_root)
        if run_root.exists():
            parser.error(f'{run_root} exists')
        run_root.mkdir(exist_ok=True, parents=True)
        params_str = json.dumps(vars(args), indent=4)
        print(params_str)
        (run_root / 'params.json').write_text(params_str)
        shutil.copy(__file__, run_root)
    else:
        run_root.mkdir(exist_ok=True, parents=True)

    # Model kind is inferred from the model name/path.
    use_bert = 'bert' in args.model
    use_gpt2 = 'gpt2' in args.model
    if args.export:
        if ((use_bert and 'bert' not in args.export)
                or (use_gpt2 and 'gpt2' not in args.export)):
            parser.error(
                "Can't determine model kind from the --export option")

    print('Loading tokenizer...')
    if use_bert:
        tokenizer = BertTokenizer.from_pretrained(
            args.model, do_lower_case='uncased' in args.model)
        pad_idx = 0
    elif use_gpt2:
        tokenizer = GPT2Tokenizer.from_pretrained(args.model)
        tokenizer.set_special_tokens([GPT2_PAD])
        pad_idx, = tokenizer.convert_tokens_to_ids([GPT2_PAD])
    else:
        raise ValueError(f'Unexpected model {args.model}')

    print('Loading model...')
    model_is_path = Path(args.model).exists()
    num_labels = 7
    if use_bert:
        model = BertForSequenceClassification.from_pretrained(
            args.model, num_labels=num_labels)
    else:
        model = GPT2ClassificationHeadModel(args.model,
                                            num_labels=num_labels)
        model.transformer.set_num_special_tokens(1)
        if model_is_path:
            # to also load linear layer weights
            model.load_state_dict(
                torch.load(Path(args.model) / 'pytorch_model.bin'))

    model_path = run_root / 'model.pt'
    optimizer_path = run_root / 'optimizer.pt'
    best_model_path = run_root / 'model-best.pt'
    valid_predictions_path = run_root / 'valid-predictions.csv'

    if args.export:
        # Write weights, config and vocab in the from_pretrained layout.
        model.load_state_dict(torch.load(best_model_path))
        export_path = Path(args.export)
        export_path.mkdir(exist_ok=True, parents=True)
        torch.save(model.state_dict(), export_path / WEIGHTS_NAME)
        model.config.to_json_file(export_path / CONFIG_NAME)
        tokenizer.save_vocabulary(export_path)
        return

    model = model.to(device)

    if args.submission:
        if not model_is_path:
            model.load_state_dict(torch.load(best_model_path))
        if amp is not None:
            model = amp.initialize(model, opt_level='O1', verbosity=0)
        make_submission(model=model,
                        tokenizer=tokenizer,
                        run_root=run_root,
                        max_seq_length=args.test_seq_length,
                        batch_size=args.batch_size,
                        pad_idx=pad_idx,
                        use_bert=use_bert,
                        bucket=args.bucket,
                        test_size=args.test_size)
        return

    # Cache the training CSV as a pickle for faster subsequent loads.
    train_pkl_path = DATA_ROOT / 'train.pkl'
    if not train_pkl_path.exists():
        pd.read_csv(DATA_ROOT / 'train.csv').to_pickle(train_pkl_path)
    df = pd.read_pickle(train_pkl_path)
    df = preprocess_df(df)
    folds = json.loads((DATA_ROOT / 'folds.json').read_text())
    valid_index = df['id'].isin(folds[args.fold])
    df_train, df_valid = df[~valid_index], df[valid_index]
    if args.train_size and len(df_train) > args.train_size:
        df_train = df_train.sample(n=args.train_size, random_state=42)
    if args.valid_size and len(df_valid) > args.valid_size:
        df_valid = df_valid.sample(n=args.valid_size, random_state=42)
    x_valid = tokenize_lines(df_valid.pop('comment_text'),
                             args.test_seq_length,
                             tokenizer,
                             use_bert=use_bert,
                             pad_idx=pad_idx)
    if args.bucket:
        # Sort by length so batches of similar lengths waste less padding.
        indices, x_valid = sorted_by_length(x_valid, pad_idx)
        # TODO recover original order before saving
        df_valid = df_valid.iloc[indices]
    y_valid, _ = get_target(df_valid)
    y_train, loss_weight = get_target(df_train)
    print(f'X_valid.shape={x_valid.shape} y_valid.shape={y_valid.shape}')
    criterion = partial(get_loss, loss_weight=loss_weight)

    def _run_validation():
        # Full validation with this run's shared settings.
        return validation(model=model,
                          criterion=criterion,
                          x_valid=x_valid,
                          y_valid=y_valid,
                          df_valid=df_valid,
                          batch_size=args.batch_size,
                          pad_idx=pad_idx,
                          bucket=args.bucket)

    if args.validation:
        if not model_is_path:
            model.load_state_dict(torch.load(best_model_path))
        if amp is not None:
            model = amp.initialize(model, opt_level='O1', verbosity=0)
        metrics, valid_predictions = _run_validation()
        for k, v in metrics.items():
            if isinstance(v, float):
                print(f'{v:.4f} {k}')
        valid_predictions.to_csv(valid_predictions_path, index=None)
        print(f'Saved validation predictions to {valid_predictions_path}')
        return

    def _save(step, model, optimizer):
        # Checkpoint weights plus optimizer state so training can resume.
        torch.save(model.state_dict(), model_path)
        torch.save({
            'optimizer': optimizer.state_dict(),
            'step': step
        }, optimizer_path)

    if args.load_weights:
        print(f'Loading weights from {args.load_weights}')
        load_info = model.load_state_dict(torch.load(args.load_weights),
                                          strict=False)
        if load_info:
            print(load_info)

    x_train = tokenize_lines(df_train.pop('comment_text'),
                             args.train_seq_length,
                             tokenizer,
                             use_bert=use_bert,
                             pad_idx=pad_idx)
    print(f'X_train.shape={x_train.shape} y_train.shape={y_train.shape}')

    best_auc = 0
    step = optimizer = None
    try:
        # train() is a generator yielding at checkpoint intervals.
        for model, optimizer, epoch_pbar, loss, step in train(
                model=model,
                criterion=criterion,
                x_train=x_train,
                y_train=y_train,
                epochs=args.epochs,
                yield_steps=args.checkpoint_interval or len(y_valid) // 8,
                bucket=args.bucket,
                lr=args.lr,
                batch_size=args.batch_size,
                accumulation_steps=args.accumulation_steps,
                pad_idx=pad_idx,
        ):
            if step == 0:
                continue  # step 0 allows saving on Ctrl+C from the start
            _save(step, model, optimizer)
            metrics, valid_predictions = _run_validation()
            metrics['loss'] = loss
            if metrics['auc'] > best_auc:
                best_auc = metrics['auc']
                shutil.copy(model_path, best_model_path)
                valid_predictions.to_csv(valid_predictions_path,
                                         index=None)
            epoch_pbar.set_postfix(
                valid_loss=f'{metrics["valid_loss"]:.4f}',
                auc=f'{metrics["auc"]:.4f}')
            json_log_plots.write_event(run_root, step=step, **metrics)
    except KeyboardInterrupt:
        if step is not None and optimizer is not None:
            print('Ctrl+C pressed, saving checkpoint')
            _save(step, model, optimizer)
        raise
def validate():
    """Compute validation metrics and append them to the plot log, keyed by
    number of examples seen."""
    metrics = get_validation_metrics()
    examples_seen = step * params['batch_size']
    json_log_plots.write_event(run_path, examples_seen, **metrics)
def train():
    """Training loop with nested progress bars and TensorBoard logging.

    Tracks epochs explicitly for per-epoch checkpoints and epoch-mean
    loss/perplexity scalars; supports resuming via ``seen_tokens``.
    """
    nonlocal seen_tokens
    # Round the epoch down to a whole number of optimizer steps.
    epoch_size = len(train_dataset) // step_tokens * step_tokens
    pbar = tqdm.trange(epochs, desc="epochs", dynamic_ncols=True,
                       disable=not is_main)
    init_epoch_pbar = lambda: tqdm.trange(epoch_size, dynamic_ncols=True,
                                          disable=not is_main)
    epoch_pbar = init_epoch_pbar()
    # Fast-forward both bars when resuming from a checkpoint.
    pbar.update(seen_tokens // epoch_size)
    pbar.refresh()
    epoch_pbar.update(seen_tokens % epoch_size)
    step = 0
    epoch, train_loss = 0, 0.0
    # context_gen = _gen_training_batch(train_dataset, n_ctx=n_ctx, batch_size=batch_size * accum_gradients)
    context = None
    avg_epoch_loss, avg_epoch_perplexity = [], []
    while seen_tokens < epochs * epoch_size:
        if max_tokens and seen_tokens >= max_tokens:
            print(f"max_tokens {max_tokens} reached, "
                  f"saving and exiting")
            save()
            validate(epoch)
            return
        # context = torch.LongTensor(next(context_gen))  # TODO GSBATCH
        train_step(context)
        seen_tokens += step_tokens
        step += 1
        epoch_pbar.update(step_tokens)
        epoch_pbar.set_description(f"epoch {1 + epoch}")
        epoch_pbar.set_postfix(loss=f"{loss_meter.mean():.2f}")
        epoch_pbar.refresh()
        if step % save_every == 0:
            save()
        if (epoch + 1) % checkpoint_every == 0:
            # Named per-epoch snapshot in addition to the rolling save.
            save(f"model-{epoch}epochs.pt")
        if is_main and step % log_every == 0:
            train_loss = loss_meter.mean()
            json_log_plots.write_event(run_path, step=seen_tokens,
                                       loss=train_loss)
            loss_meter.reset()
            avg_epoch_loss.append(train_loss)
            avg_epoch_perplexity.append(np.exp(train_loss))
            log_writer_train.add_scalar("loss_iter", float(train_loss),
                                        seen_tokens)
            log_writer_train.add_scalar("perplexity_iter",
                                        float(np.exp(train_loss)),
                                        seen_tokens)
        if step % validate_every == 0:
            validate(epoch)
        if seen_tokens % epoch_size == 0:
            # Epoch boundary: rotate the inner bar, log epoch-mean stats.
            pbar.update()
            epoch_pbar.close()
            epoch_pbar = init_epoch_pbar()
            if is_main:
                log_writer_train.add_scalar(
                    "loss_epoch",
                    sum(avg_epoch_loss) / len(avg_epoch_loss), epoch)
                log_writer_train.add_scalar(
                    "perplexity_epoch",
                    sum(avg_epoch_perplexity) / len(avg_epoch_perplexity),
                    epoch)
            avg_epoch_loss.clear()
            avg_epoch_perplexity.clear()
            epoch += 1
    # end of training
    save()
    validate(epoch)