def _build_re_dataset(cfg, frame, tokenizer):
    """Tokenize ``frame`` and wrap it, with its ``label`` column, in an ``RE_Dataset``.

    ``roberta_tokenized_dataset`` is used when ``cfg.values.model_arc`` is
    ``'Roberta'``; every other architecture goes through ``tokenized_dataset``.
    """
    if cfg.values.model_arc == 'Roberta':
        encoded = roberta_tokenized_dataset(frame, tokenizer)
    else:
        encoded = tokenized_dataset(frame, tokenizer)
    return RE_Dataset(encoded, frame['label'].values)


def _load_classifier(cfg, model_config, device, bert_shortcut=True):
    """Load the sequence-classification model for ``cfg`` and move it to ``device``.

    The Auto class (or ``BertForSequenceClassification`` when ``bert_shortcut``
    is set and the model name is literally ``'Bert'``) is tried first.  If that
    raises, the class named ``cfg.values.model_arc + 'ForSequenceClassification'``
    is looked up in ``transformers`` and used instead.
    """
    model_name = cfg.values.model_name
    try:
        # NOTE(review): comparing the checkpoint *name* with 'Bert' looks like it
        # was meant to test cfg.values.model_arc -- behaviour kept as-is, confirm.
        if bert_shortcut and model_name == 'Bert':
            model = BertForSequenceClassification.from_pretrained(
                model_name, config=model_config)
        else:
            model = AutoModelForSequenceClassification.from_pretrained(
                model_name, config=model_config)
    except Exception as err:  # was a bare ``except:`` that also ate KeyboardInterrupt
        print(f'Default model loading failed ({err!r}); '
              f'falling back to {cfg.values.model_arc}ForSequenceClassification')
        model_module = getattr(
            import_module('transformers'),
            cfg.values.model_arc + 'ForSequenceClassification')
        model = model_module.from_pretrained(model_name, config=model_config)
    model.to(device)
    return model


def _scheduler_total_steps(dataset_size, training_args):
    """Return the integer number of optimizer updates the Trainer will perform.

    The previous ``len(ds) / batch_size * epochs`` produced a float and ignored
    the partial last batch, ``n_gpu``, gradient accumulation and ``max_steps``,
    so the LR schedule did not line up with the real training length.
    Assumes single-process training (no distributed sampler) -- TODO confirm.
    """
    import math

    if training_args.max_steps > 0:
        return training_args.max_steps
    batch_size = training_args.train_batch_size  # per-device size * n_gpu
    if training_args.dataloader_drop_last:
        batches_per_epoch = dataset_size // batch_size
    else:
        batches_per_epoch = math.ceil(dataset_size / batch_size)
    updates_per_epoch = max(
        batches_per_epoch // training_args.gradient_accumulation_steps, 1)
    return math.ceil(training_args.num_train_epochs * updates_per_epoch)


def _with_linear_schedule(optimizer, dataset_size, training_args):
    """Pair ``optimizer`` with a linear warmup/decay scheduler for the Trainer."""
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=training_args.warmup_steps,
        num_training_steps=_scheduler_total_steps(dataset_size, training_args))
    return optimizer, scheduler


def train(cfg):
    """Fine-tune a 42-label sequence classifier according to ``cfg``.

    Depending on the configuration one of three modes runs:

    * ``cfg.values.val_args.use_kfold``: stratified k-fold cross validation,
      one fresh model per fold, optimized with MADGRAD, outputs under
      ``<output_dir>/<k>fold``.
    * otherwise, ``cfg.values.train_only`` false: a single train/validation
      split, optimized with AdamW, with the early-stopping callback attached.
    * otherwise: training on the whole data set without evaluation, optimized
      with AdamP, outputs under ``<output_dir>/only_train``.

    Args:
        cfg: configuration object exposing the ``cfg.values.*`` attributes read
            below (seed, model/tokenizer names, ``train_args``, ``val_args``).

    Returns:
        None. Results are whatever the ``Trainer`` writes to its output and
        logging directories.
    """
    seed = cfg.values.seed
    model_name = cfg.values.model_name
    train_cfg = cfg.values.train_args

    seed_everything(seed)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model_config = AutoConfig.from_pretrained(model_name)
    model_config.num_labels = 42

    whole_df = load_data("/opt/ml/input/data/train/train.tsv")
    # The additional_train.tsv frame used to be loaded here but was never used,
    # so the load has been dropped.

    if cfg.values.tokenizer_arc:
        tokenizer_module = getattr(import_module('transformers'),
                                   cfg.values.tokenizer_arc)
        tokenizer = tokenizer_module.from_pretrained(model_name)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir=train_cfg.output_dir,  # output directory
        save_total_limit=train_cfg.save_total_limit,  # number of checkpoints kept
        save_steps=train_cfg.save_steps,  # model saving step
        num_train_epochs=train_cfg.num_epochs,  # total number of training epochs
        learning_rate=train_cfg.lr,
        per_device_train_batch_size=train_cfg.train_batch_size,
        per_device_eval_batch_size=train_cfg.eval_batch_size,
        warmup_steps=train_cfg.warmup_steps,  # warmup steps for the LR scheduler
        weight_decay=train_cfg.weight_decay,
        max_grad_norm=train_cfg.max_grad_norm,
        logging_dir=train_cfg.logging_dir,  # directory for storing logs
        logging_steps=train_cfg.logging_steps,
        # `no`: no evaluation, `steps`: every `eval_steps`, `epoch`: every epoch end
        evaluation_strategy=train_cfg.evaluation_strategy,
        eval_steps=train_cfg.eval_steps,
        dataloader_num_workers=4,
        seed=seed,
        label_smoothing_factor=train_cfg.label_smoothing_factor,
        load_best_model_at_end=True,
    )

    if cfg.values.val_args.use_kfold:
        kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k)
        folds = kfold.split(whole_df, whole_df['label'].values)
        for k, (train_idx, val_idx) in enumerate(folds, start=1):
            print('\n')
            cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15)

            train_dataset = _build_re_dataset(cfg, whole_df.iloc[train_idx], tokenizer)
            val_dataset = _build_re_dataset(cfg, whole_df.iloc[val_idx], tokenizer)
            model = _load_classifier(cfg, model_config, device)

            # Each fold writes checkpoints and logs into its own sub-directory.
            training_args.output_dir = train_cfg.output_dir + f'/{k}fold'
            training_args.logging_dir = train_cfg.output_dir + f'/{k}fold'

            optimizer = MADGRAD(model.parameters(), lr=training_args.learning_rate)
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                compute_metrics=compute_metrics,
                optimizers=_with_linear_schedule(
                    optimizer, len(train_dataset), training_args),
            )
            trainer.train()
        return

    cpprint('=' * 20 + f'START TRAINING' + '=' * 20)

    if not cfg.values.train_only:
        train_df, val_df = train_test_split(
            whole_df,
            test_size=cfg.values.val_args.test_size,
            random_state=seed)
        train_dataset = _build_re_dataset(cfg, train_df, tokenizer)
        val_dataset = _build_re_dataset(cfg, val_df, tokenizer)
        model = _load_classifier(cfg, model_config, device)

        # With this patience the callback can effectively never stop training.
        early_stopping = EarlyStoppingCallback(
            early_stopping_patience=9999999, early_stopping_threshold=0.001)
        optimizer = transformers.AdamW(model.parameters(),
                                       lr=training_args.learning_rate)
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            optimizers=_with_linear_schedule(
                optimizer, len(train_dataset), training_args),
            callbacks=[early_stopping],
        )
        trainer.train()
        return

    # Train-only mode: no validation split, so evaluation is switched off.
    training_args.evaluation_strategy = 'no'
    if cfg.values.model_arc == 'Roberta':
        print('Roberta')
    train_dataset = _build_re_dataset(cfg, whole_df, tokenizer)
    # This mode never used the explicit Bert class, only the Auto class.
    model = _load_classifier(cfg, model_config, device, bert_shortcut=False)

    training_args.output_dir = train_cfg.output_dir + '/only_train'
    training_args.logging_dir = train_cfg.output_dir + '/only_train'

    optimizer = AdamP(model.parameters(), lr=training_args.learning_rate)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        optimizers=_with_linear_schedule(
            optimizer, len(train_dataset), training_args),
    )
    trainer.train()
def generate_training_args(args, inoculation_step):
    """Build the ``TrainingArguments`` for one inoculation step.

    Values are copied from the parsed command-line ``args`` onto a fresh
    ``TrainingArguments``, the result is round-tripped through
    ``HfArgumentParser.parse_dict``, and then the seed and logging are set up.

    Args:
        args: namespace carrying the attributes read below (``seed``,
            ``output_dir``, ``learning_rate``, ``model_type``, ``model_path``, ...).
            ``args.model_path`` is filled in from ``args.model_type`` when empty.
        inoculation_step: identifier of the current step; the output directory
            becomes ``<args.output_dir>/<inoculation_step>-sample``.

    Returns:
        The re-parsed ``TrainingArguments`` instance.

    Raises:
        ValueError: if both ``args.model_path`` and ``args.model_type`` are empty
            (previously an ``assert False``, which disappears under ``python -O``).
    """
    import datetime

    training_args = TrainingArguments("tmp_trainer")
    training_args.no_cuda = args.no_cuda
    training_args.seed = args.seed
    training_args.do_train = args.do_train
    training_args.do_eval = args.do_eval
    training_args.output_dir = os.path.join(
        args.output_dir, str(inoculation_step) + "-sample")
    training_args.evaluation_strategy = args.evaluation_strategy
    training_args.metric_for_best_model = args.metric_for_best_model
    training_args.greater_is_better = args.greater_is_better
    training_args.logging_dir = args.logging_dir
    training_args.task_name = args.task_name
    training_args.learning_rate = args.learning_rate
    training_args.per_device_train_batch_size = args.per_device_train_batch_size
    training_args.per_device_eval_batch_size = args.per_device_eval_batch_size
    # Upper bound on the number of epochs.
    training_args.num_train_epochs = args.num_train_epochs
    training_args.eval_steps = args.eval_steps
    training_args.logging_steps = args.logging_steps
    training_args.load_best_model_at_end = args.load_best_model_at_end
    if args.save_total_limit != -1:
        # -1 means "not specified": keep the TrainingArguments default.
        training_args.save_total_limit = args.save_total_limit

    # Read the clock once so month and day always come from the same instant.
    now = datetime.datetime.now()
    date_time = "{}-{}".format(now.month, now.day)
    training_args.run_name = "{0}_{1}_{2}_{3}_mlen_{4}_lr_{5}_seed_{6}_metrics_{7}".format(
        args.run_name,
        args.task_name,
        args.model_type,
        date_time,
        args.max_seq_length,
        args.learning_rate,
        args.seed,
        args.metric_for_best_model,
    )

    # Rename the private "_n_gpu" entry to "n_gpu" before re-parsing
    # (marked "for PR" in the original code).
    training_args_dict = training_args.to_dict()
    training_args_dict["n_gpu"] = training_args_dict.pop("_n_gpu")

    hf_parser = HfArgumentParser((TrainingArguments))
    training_args = hf_parser.parse_dict(training_args_dict)[0]

    # NOTE(review): the nesting of this check was reconstructed from the comment
    # "you have to provide one of them" -- confirm against the original layout.
    if args.model_path == "":
        if args.model_type == "":
            raise ValueError(
                "Either args.model_path or args.model_type must be provided.")
        args.model_path = args.model_type

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log a short summary on every process.  The two fragments used to be
    # joined without a separator ("n_gpu: 1distributed training: ...").
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")
    return training_args