def run_experiment(args): logger.info("\n***********************************************" f"\n************* Experiment: {args.task.name} ************" "\n************************************************") ml_logger = MlLogger(tracking_uri=args.logging.mlflow_url) ml_logger.init_experiment( experiment_name=args.logging.mlflow_experiment, run_name=args.logging.mlflow_run_name, nested=args.logging.mlflow_nested, ) validate_args(args) distributed = bool(args.general.local_rank != -1) # Init device and distributed settings device, n_gpu = initialize_device_settings( use_cuda=args.general.cuda, local_rank=args.general.local_rank, fp16=args.general.fp16, ) args.parameter.batch_size = int(args.parameter.batch_size // args.parameter.gradient_accumulation_steps) if n_gpu > 1: args.parameter.batch_size = args.parameter.batch_size * n_gpu set_all_seeds(args.general.seed) # Prepare Data tokenizer = Tokenizer.load(args.parameter.model, do_lower_case=args.parameter.lower_case) processor = Processor.load( tokenizer=tokenizer, max_seq_len=args.parameter.max_seq_len, data_dir=args.general.data_dir, **args.task.toDict( ), # args is of type DotMap and needs conversion to std python dicts ) data_silo = DataSilo( processor=processor, batch_size=args.parameter.batch_size, distributed=distributed, ) class_weights = None if args.parameter.balance_classes: task_names = list(processor.tasks.keys()) if len(task_names) > 1: raise NotImplementedError( f"Balancing classes is currently not supported for multitask experiments. Got tasks: {task_names} " ) class_weights = data_silo.calculate_class_weights( task_name=task_names[0]) model = get_adaptive_model( lm_output_type=args.parameter.lm_output_type, prediction_heads=args.parameter.prediction_head, layer_dims=args.parameter.layer_dims, model=args.parameter.model, device=device, class_weights=class_weights, embeds_dropout_prob=args.parameter.embeds_dropout_prob, ) # Init optimizer # TODO: warmup linear is sometimes NONE depending on fp16 - is there a neater way to handle this? optimizer, warmup_linear = initialize_optimizer( model=model, learning_rate=args.parameter.learning_rate, warmup_proportion=args.parameter.warmup_proportion, loss_scale=args.general.loss_scale, fp16=args.general.fp16, n_batches=len(data_silo.loaders["train"]), grad_acc_steps=args.parameter.gradient_accumulation_steps, n_epochs=args.parameter.epochs, ) trainer = Trainer( optimizer=optimizer, data_silo=data_silo, epochs=args.parameter.epochs, n_gpu=n_gpu, grad_acc_steps=args.parameter.gradient_accumulation_steps, fp16=args.general.fp16, local_rank=args.general.local_rank, warmup_linear=warmup_linear, evaluate_every=args.logging.eval_every, device=device, ) model = trainer.train(model) model_name = ( f"{model.language_model.name}-{model.language_model.language}-{args.task.name}" ) processor.save(f"{args.general.output_dir}/{model_name}") model.save(f"{args.general.output_dir}/{model_name}")
def run_experiment(args): logger.info("\n***********************************************" f"\n************* Experiment: {args.task.name} ************" "\n************************************************") ml_logger = MlLogger(tracking_uri=args.logging.mlflow_url) ml_logger.init_experiment( experiment_name=args.logging.mlflow_experiment, run_name=args.logging.mlflow_run_name, nested=args.logging.mlflow_nested, ) validate_args(args) distributed = bool(args.general.local_rank != -1) # Init device and distributed settings device, n_gpu = initialize_device_settings( use_cuda=args.general.cuda, local_rank=args.general.local_rank, use_amp=args.general.use_amp, ) args.parameter.batch_size = int(args.parameter.batch_size // args.parameter.gradient_accumulation_steps) set_all_seeds(args.general.seed) # Prepare Data tokenizer = Tokenizer.load(args.parameter.model, do_lower_case=args.parameter.lower_case) processor = Processor.load( tokenizer=tokenizer, max_seq_len=args.parameter.max_seq_len, data_dir=Path(args.general.data_dir), **args.task.toDict( ), # args is of type DotMap and needs conversion to std python dicts ) data_silo = DataSilo( processor=processor, batch_size=args.parameter.batch_size, distributed=distributed, ) class_weights = None if args.parameter.balance_classes: task_names = list(processor.tasks.keys()) if len(task_names) > 1: raise NotImplementedError( f"Balancing classes is currently not supported for multitask experiments. Got tasks: {task_names} " ) class_weights = data_silo.calculate_class_weights( task_name=task_names[0]) model = get_adaptive_model( lm_output_type=args.parameter.lm_output_type, prediction_heads=args.parameter.prediction_head, layer_dims=args.parameter.layer_dims, model=args.parameter.model, device=device, class_weights=class_weights, embeds_dropout_prob=args.parameter.embeds_dropout_prob, ) # Init optimizer optimizer_opts = args.optimizer.optimizer_opts.toDict( ) if args.optimizer.optimizer_opts else None schedule_opts = args.optimizer.schedule_opts.toDict( ) if args.optimizer.schedule_opts else None model, optimizer, lr_schedule = initialize_optimizer( model=model, learning_rate=args.optimizer.learning_rate, schedule_opts=schedule_opts, optimizer_opts=optimizer_opts, use_amp=args.general.use_amp, n_batches=len(data_silo.loaders["train"]), grad_acc_steps=args.parameter.gradient_accumulation_steps, n_epochs=args.parameter.epochs, device=device) model_name = ( f"{model.language_model.name}-{model.language_model.language}-{args.task.name}" ) # An early stopping instance can be used to save the model that performs best on the dev set # according to some metric and stop training when no improvement is happening for some iterations. if "early_stopping" in args: early_stopping = EarlyStopping( metric=args.task.metric, mode=args.early_stopping.mode, save_dir=Path( f"{args.general.output_dir}/{model_name}_early_stopping" ), # where to save the best model patience=args.early_stopping. 
patience # number of evaluations to wait for improvement before terminating the training ) else: early_stopping = None trainer = Trainer( model=model, optimizer=optimizer, data_silo=data_silo, epochs=args.parameter.epochs, n_gpu=n_gpu, grad_acc_steps=args.parameter.gradient_accumulation_steps, use_amp=args.general.use_amp, local_rank=args.general.local_rank, lr_schedule=lr_schedule, evaluate_every=args.logging.eval_every, device=device, early_stopping=early_stopping) model = trainer.train() processor.save(Path(f"{args.general.output_dir}/{model_name}")) model.save(Path(f"{args.general.output_dir}/{model_name}")) ml_logger.end_run()
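# The newer variant above swaps fp16 for use_amp and reads two additional config
# sections: args.optimizer (learning rate plus optional optimizer_opts/schedule_opts,
# forwarded as plain dicts to initialize_optimizer) and an optional args.early_stopping
# block. The sketch below shows only those deltas on top of a base config like the one
# after the first variant; the concrete values are illustrative assumptions, not
# defaults shipped with the library.
from dotmap import DotMap

extra_sections = DotMap({
    "general": {
        "use_amp": None,          # placeholder; a truthy value enables mixed precision
    },
    "optimizer": {
        "learning_rate": 3e-5,
        "optimizer_opts": None,   # optional dict of optimizer settings (illustrative)
        "schedule_opts": None,    # optional dict of LR schedule settings (illustrative)
    },
    "early_stopping": {           # presence of this key ("early_stopping" in args) enables it
        "mode": "max",            # together with args.task.metric decides what "better" means
        "patience": 2,            # evaluations without improvement before training stops
    },
})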
def run_experiment(args):
    validate_args(args)
    directory_setup(output_dir=args.output_dir, do_train=args.do_train)
    distributed = bool(args.local_rank != -1)

    # Init device and distributed settings
    device, n_gpu = initialize_device_settings(
        use_cuda=args.cuda, local_rank=args.local_rank, fp16=args.fp16
    )

    args.batch_size = args.batch_size // args.gradient_accumulation_steps
    if n_gpu > 1:
        args.batch_size = args.batch_size * n_gpu

    set_all_seeds(args.seed)

    # Prepare Data
    tokenizer = BertTokenizer.from_pretrained(args.model, do_lower_case=args.lower_case)
    processor = Processor.load(
        processor_name=args.processor_name,
        tokenizer=tokenizer,
        max_seq_len=args.max_seq_len,
        data_dir=args.data_dir,
    )

    data_silo = DataSilo(processor=processor, batch_size=args.batch_size, distributed=distributed)

    class_weights = None
    if args.balance_classes:
        class_weights = data_silo.class_weights

    model = get_adaptive_model(
        lm_output_type=args.lm_output_type,
        prediction_heads=args.prediction_head,
        layer_dims=args.layer_dims,
        model=args.model,
        device=device,
        class_weights=class_weights,
        fp16=args.fp16,
        embeds_dropout_prob=args.embeds_dropout_prob,
        local_rank=args.local_rank,
        n_gpu=n_gpu,
    )

    # Init optimizer
    # TODO: warmup linear is sometimes NONE depending on fp16 - is there a neater way to handle this?
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=args.learning_rate,
        warmup_proportion=args.warmup_proportion,
        loss_scale=args.loss_scale,
        fp16=args.fp16,
        n_examples=data_silo.n_samples("train"),
        batch_size=args.batch_size,
        grad_acc_steps=args.gradient_accumulation_steps,
        n_epochs=args.epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=args.epochs,
        n_gpu=n_gpu,
        grad_acc_steps=args.gradient_accumulation_steps,
        fp16=args.fp16,
        warmup_linear=warmup_linear,
        evaluate_every=args.eval_every,
        device=device,
    )

    model = trainer.train(model)

    model_name = (
        f"{model.language_model.name}-{model.language_model.language}-{args.name}"
    )
    processor.save(f"saved_models/{model_name}")
    model.save(f"saved_models/{model_name}")
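# Unlike the nested-config variants, the version above expects a flat args object
# (args.batch_size, args.local_rank, ...). A plain argparse.Namespace with one entry
# per attribute the function touches is enough; the values below are illustrative
# placeholders, not recommended settings, and the processor/head names are assumptions.
from argparse import Namespace

example_flat_args = Namespace(
    name="example_task",                  # used only for the saved model name
    output_dir="saved_models",            # placeholder
    do_train=True,
    local_rank=-1,                        # -1 => single-process training
    cuda=True,
    fp16=False,
    loss_scale=0,
    seed=42,
    model="bert-base-cased",              # placeholder model identifier
    lower_case=False,
    processor_name="TextClassificationProcessor",  # assumed processor registry name
    max_seq_len=128,
    data_dir="data/example_task",         # placeholder
    batch_size=32,
    gradient_accumulation_steps=1,
    balance_classes=False,
    lm_output_type="per_sequence",
    prediction_head="TextClassificationHead",      # assumed head name
    layer_dims=[768, 2],
    embeds_dropout_prob=0.1,
    learning_rate=2e-5,
    warmup_proportion=0.1,
    epochs=1,
    eval_every=100,
)

# run_experiment(example_flat_args)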
def run_experiment(args):
    validate_args(args)
    distributed = bool(args.general.local_rank != -1)

    # Init device and distributed settings
    device, n_gpu = initialize_device_settings(
        use_cuda=args.general.cuda,
        local_rank=args.general.local_rank,
        fp16=args.general.fp16,
    )

    args.parameter.batch_size = int(
        args.parameter.batch_size // args.parameter.gradient_accumulation_steps
    )
    if n_gpu > 1:
        args.parameter.batch_size = args.parameter.batch_size * n_gpu

    set_all_seeds(args.general.seed)

    # Prepare Data
    tokenizer = BertTokenizer.from_pretrained(
        args.parameter.model, do_lower_case=args.parameter.lower_case
    )

    # processor = Processor.load(
    #     tokenizer=tokenizer,
    #     max_seq_len=args.parameter.max_seq_len,
    #     data_dir=args.general.data_dir,
    #     train_filename=args.task.train_filename,
    #     dev_filename=args.task.dev_filename,
    #     test_filename=args.task.test_filename,
    #     dev_split=args.task.dev_split,
    #     metrics=args.task.metrics,
    #     **args.task.toDict(),  # args is of type DotMap and needs conversion to std python dicts
    # )
    processor = Processor.load(
        tokenizer=tokenizer,
        max_seq_len=args.parameter.max_seq_len,
        data_dir=args.general.data_dir,
        **args.task.toDict(),  # args is of type DotMap and needs conversion to std python dicts
    )

    data_silo = DataSilo(
        processor=processor,
        batch_size=args.parameter.batch_size,
        distributed=distributed,
    )

    class_weights = None
    if args.parameter.balance_classes:
        class_weights = data_silo.class_weights

    model = get_adaptive_model(
        lm_output_type=args.parameter.lm_output_type,
        prediction_heads=args.parameter.prediction_head,
        layer_dims=args.parameter.layer_dims,
        model=args.parameter.model,
        device=device,
        class_weights=class_weights,
        embeds_dropout_prob=args.parameter.embeds_dropout_prob,
    )

    # Init optimizer
    # TODO: warmup linear is sometimes NONE depending on fp16 - is there a neater way to handle this?
    optimizer, warmup_linear = initialize_optimizer(
        model=model,
        learning_rate=args.parameter.learning_rate,
        warmup_proportion=args.parameter.warmup_proportion,
        loss_scale=args.general.loss_scale,
        fp16=args.general.fp16,
        n_batches=len(data_silo.loaders["train"]),
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        n_epochs=args.parameter.epochs,
    )

    trainer = Trainer(
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=args.parameter.epochs,
        n_gpu=n_gpu,
        grad_acc_steps=args.parameter.gradient_accumulation_steps,
        fp16=args.general.fp16,
        local_rank=args.general.local_rank,
        warmup_linear=warmup_linear,
        evaluate_every=args.logging.eval_every,
        device=device,
    )

    model = trainer.train(model)

    model_name = (
        f"{model.language_model.name}-{model.language_model.language}-{args.task.name}"
    )
    processor.save(f"{args.general.output_dir}/{model_name}")
    model.save(f"{args.general.output_dir}/{model_name}")