class NewsClassifier(nn.Module):
    """BERT-based classifier for the AG_NEWS dataset.

    A frozen ``bert-base-uncased`` encoder feeds a small trainable head
    (dropout -> fc1 -> ReLU -> out).  The class also bundles data
    preparation, training, evaluation and prediction helpers.
    """

    def __init__(self, args):
        """
        :param args: parsed CLI arguments; must provide ``max_epochs``,
            ``num_samples`` and ``vocab_file`` (URL of the BERT vocab file).
        """
        super().__init__()
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.PRE_TRAINED_MODEL_NAME = "bert-base-uncased"
        self.EPOCHS = args.max_epochs
        # Populated later by prepare_data() / setOptimizer().
        self.df = None
        self.tokenizer = None
        self.df_train = None
        self.df_val = None
        self.df_test = None
        self.train_data_loader = None
        self.val_data_loader = None
        self.test_data_loader = None
        self.optimizer = None
        self.total_steps = None
        self.scheduler = None
        self.loss_fn = None
        self.BATCH_SIZE = 16
        self.MAX_LEN = 160
        self.NUM_SAMPLES_COUNT = args.num_samples
        # NOTE(review): ``class_names`` must be defined at module level;
        # it is not visible inside this class.
        n_classes = len(class_names)
        self.VOCAB_FILE_URL = args.vocab_file
        self.VOCAB_FILE = "bert_base_uncased_vocab.txt"
        self.drop = nn.Dropout(p=0.2)
        self.bert = BertModel.from_pretrained(self.PRE_TRAINED_MODEL_NAME)
        # Freeze the encoder: only the classification head is trained.
        for param in self.bert.parameters():
            param.requires_grad = False
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 512)
        self.out = nn.Linear(512, n_classes)

    def forward(self, input_ids, attention_mask):
        """
        :param input_ids: Input sentences from the batch
        :param attention_mask: Attention mask returned by the encoder
        :return: output - class logits for the input text
        """
        pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask).pooler_output
        output = F.relu(self.fc1(pooled_output))
        output = self.drop(output)
        output = self.out(output)
        return output

    @staticmethod
    def process_label(rating):
        """Map a 1-based AG_NEWS label to a 0-based class index."""
        rating = int(rating)
        return rating - 1

    def create_data_loader(self, df, tokenizer, max_len, batch_size):
        """
        :param df: DataFrame input
        :param tokenizer: Bert tokenizer
        :param max_len: maximum length of the input sentence
        :param batch_size: Input batch size
        :return: output - Corresponding data loader for the given input
        """
        ds = AGNewsDataset(
            reviews=df.description.to_numpy(),
            targets=df.label.to_numpy(),
            tokenizer=tokenizer,
            max_len=max_len,
        )
        return DataLoader(ds, batch_size=batch_size, num_workers=4)

    def prepare_data(self):
        """Creates train, valid and test dataloaders from the csv data.

        :raises RuntimeError: if the train csv cannot be located or the
            vocab file cannot be downloaded.
        """
        td.AG_NEWS(root="data", split=("train", "test"))
        extracted_files = os.listdir("data/AG_NEWS")
        train_csv_path = None
        for fname in extracted_files:
            if fname.endswith("train.csv"):
                train_csv_path = os.path.join(
                    os.getcwd(), "data/AG_NEWS", fname)
        # Fail fast with a clear message instead of pd.read_csv(None).
        if train_csv_path is None:
            raise RuntimeError("train.csv not found under data/AG_NEWS")
        self.df = pd.read_csv(train_csv_path)
        self.df.columns = ["label", "title", "description"]
        # BUG FIX: DataFrame.sample returns a new frame; the original
        # discarded the result, so the data was never shuffled before
        # being truncated to NUM_SAMPLES_COUNT rows.
        self.df = self.df.sample(frac=1)
        self.df = self.df.iloc[:self.NUM_SAMPLES_COUNT]
        self.df["label"] = self.df.label.apply(self.process_label)
        if not os.path.isfile(self.VOCAB_FILE):
            filePointer = requests.get(self.VOCAB_FILE_URL,
                                       allow_redirects=True)
            if filePointer.ok:
                with open(self.VOCAB_FILE, "wb") as f:
                    f.write(filePointer.content)
            else:
                raise RuntimeError("Error in fetching the vocab file")
        self.tokenizer = BertTokenizer(self.VOCAB_FILE)
        RANDOM_SEED = 42
        np.random.seed(RANDOM_SEED)
        torch.manual_seed(RANDOM_SEED)
        # 90/5/5 split, stratified so every class keeps its proportion.
        self.df_train, self.df_test = train_test_split(
            self.df,
            test_size=0.1,
            random_state=RANDOM_SEED,
            stratify=self.df["label"])
        self.df_val, self.df_test = train_test_split(
            self.df_test,
            test_size=0.5,
            random_state=RANDOM_SEED,
            stratify=self.df_test["label"])
        self.train_data_loader = self.create_data_loader(
            self.df_train, self.tokenizer, self.MAX_LEN, self.BATCH_SIZE)
        self.val_data_loader = self.create_data_loader(
            self.df_val, self.tokenizer, self.MAX_LEN, self.BATCH_SIZE)
        self.test_data_loader = self.create_data_loader(
            self.df_test, self.tokenizer, self.MAX_LEN, self.BATCH_SIZE)

    def setOptimizer(self):
        """Sets the optimizer, scheduler and loss functions.

        BUG FIX: the original passed ``model.parameters()`` where ``model``
        was an undefined global; the optimizer now updates this module's
        own parameters.
        """
        self.optimizer = AdamW(self.parameters(), lr=1e-3, correct_bias=False)
        self.total_steps = len(self.train_data_loader) * self.EPOCHS
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=0,
            num_training_steps=self.total_steps)
        self.loss_fn = nn.CrossEntropyLoss().to(self.device)

    def startTraining(self, model):
        """Initializes the training loop, keeping the best checkpoint.

        :param model: Instance of the NewsClassifier class
        """
        history = defaultdict(list)
        best_accuracy = 0
        for epoch in range(self.EPOCHS):
            print(f"Epoch {epoch + 1}/{self.EPOCHS}")
            train_acc, train_loss = self.train_epoch(model)
            print(f"Train loss {train_loss} accuracy {train_acc}")
            val_acc, val_loss = self.eval_model(model, self.val_data_loader)
            print(f"Val loss {val_loss} accuracy {val_acc}")
            history["train_acc"].append(train_acc)
            history["train_loss"].append(train_loss)
            history["val_acc"].append(val_acc)
            history["val_loss"].append(val_loss)
            if val_acc > best_accuracy:
                torch.save(model.state_dict(), "best_model_state.bin")
                best_accuracy = val_acc

    def train_epoch(self, model):
        """Runs one training epoch.

        :param model: Instance of the NewsClassifier class
        :return: (accuracy, mean loss) over the epoch
        """
        model = model.train()
        losses = []
        correct_predictions = 0
        n_examples = 0
        for data in tqdm(self.train_data_loader):
            input_ids = data["input_ids"].to(self.device)
            attention_mask = data["attention_mask"].to(self.device)
            targets = data["targets"].to(self.device)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = self.loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            n_examples += targets.size(0)
            losses.append(loss.item())
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()
        # BUG FIX: divide by the true number of samples seen; the original
        # used len(loader) * BATCH_SIZE, which is wrong when the last
        # batch is partial.
        return correct_predictions.double() / n_examples, np.mean(losses)

    def eval_model(self, model, data_loader):
        """Evaluates the model on a validation / test dataloader.

        :param model: Instance of the NewsClassifier class
        :param data_loader: Data loader for either test / validation dataset
        :return: (accuracy, mean loss) over the dataset
        """
        model = model.eval()
        losses = []
        correct_predictions = 0
        n_examples = 0
        with torch.no_grad():
            for d in data_loader:
                input_ids = d["input_ids"].to(self.device)
                attention_mask = d["attention_mask"].to(self.device)
                targets = d["targets"].to(self.device)
                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask)
                _, preds = torch.max(outputs, dim=1)
                loss = self.loss_fn(outputs, targets)
                correct_predictions += torch.sum(preds == targets)
                n_examples += targets.size(0)
                losses.append(loss.item())
        # BUG FIX: divide by the true sample count (see train_epoch).
        return correct_predictions.double() / n_examples, np.mean(losses)

    def get_predictions(self, model, data_loader):
        """Runs inference over a dataloader.

        :param model: Instance of the NewsClassifier class
        :param data_loader: Data loader for either test / validation dataset
        :return: (review texts, predicted labels, prediction probabilities,
            true labels)
        """
        model = model.eval()
        review_texts = []
        predictions = []
        prediction_probs = []
        real_values = []
        with torch.no_grad():
            for d in data_loader:
                texts = d["review_text"]
                input_ids = d["input_ids"].to(self.device)
                attention_mask = d["attention_mask"].to(self.device)
                targets = d["targets"].to(self.device)
                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask)
                _, preds = torch.max(outputs, dim=1)
                probs = F.softmax(outputs, dim=1)
                review_texts.extend(texts)
                predictions.extend(preds)
                prediction_probs.extend(probs)
                real_values.extend(targets)
        predictions = torch.stack(predictions).cpu()
        prediction_probs = torch.stack(prediction_probs).cpu()
        real_values = torch.stack(real_values).cpu()
        return review_texts, predictions, prediction_probs, real_values
def train(args, train_dataset, model, tokenizer, teacher=None):
    """Train ``model`` on ``train_dataset``, optionally distilling from
    ``teacher``.

    The batch supplies start/end span positions (batch[3]/batch[4]), so the
    model is trained for span prediction.  When ``teacher`` is given, a
    temperature-scaled KL divergence on the start/end logits is blended
    with the task loss via ``args.alpha_ce`` / ``args.alpha_squad``.

    :return: (global_step, average training loss per optimizer step)
    """
    # TensorBoard writer only on the main process.
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # Either a fixed step budget (deriving the epoch count) or a fixed
    # epoch count (deriving the total step budget).
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are exempt from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            if teacher is not None:
                teacher.eval()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }
            # DistilBERT takes no token_type_ids; XLM passes None for them.
            if args.model_type != "distilbert":
                inputs[
                    "token_type_ids"] = None if args.model_type == "xlm" else batch[
                        2]
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
            outputs = model(**inputs)
            loss, start_logits_stu, end_logits_stu = outputs
            # Distillation loss
            if teacher is not None:
                if "token_type_ids" not in inputs:
                    inputs[
                        "token_type_ids"] = None if args.teacher_type == "xlm" else batch[
                            2]
                # Teacher forward pass is inference-only.
                with torch.no_grad():
                    start_logits_tea, end_logits_tea = teacher(
                        input_ids=inputs["input_ids"],
                        token_type_ids=inputs["token_type_ids"],
                        attention_mask=inputs["attention_mask"],
                    )
                assert start_logits_tea.size() == start_logits_stu.size()
                assert end_logits_tea.size() == end_logits_stu.size()
                # Temperature-scaled KL between student and teacher logits;
                # the T^2 factor keeps gradient magnitudes comparable.
                loss_fct = nn.KLDivLoss(reduction="batchmean")
                loss_start = loss_fct(
                    F.log_softmax(start_logits_stu / args.temperature, dim=-1),
                    F.softmax(start_logits_tea / args.temperature, dim=-1),
                ) * (args.temperature**2)
                loss_end = loss_fct(
                    F.log_softmax(end_logits_stu / args.temperature, dim=-1),
                    F.softmax(end_logits_tea / args.temperature, dim=-1),
                ) * (args.temperature**2)
                loss_ce = (loss_start + loss_end) / 2.0
                loss = args.alpha_ce * loss_ce + args.alpha_squad * loss
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
            tr_loss += loss.item()
            # Optimizer step only once every gradient_accumulation_steps
            # micro-batches.
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1 and
                            args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss",
                                         (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
def main():
    """Train a dual-encoder XMC model with HF Accelerate + AMP.

    Builds label/instance encoders, prepares the label set and one of four
    training modes ('ict', 'self-train', 'finetune-pair', 'finetune-label'),
    then runs a mixed-precision contrastive training loop with periodic
    eval/clustering and encoder checkpointing.
    """
    # Initialize the accelerator. We will let the accelerator handle device
    # placement for us in this example.
    args = parse_args()
    distributed_args = accelerate.DistributedDataParallelKwargs(
        find_unused_parameters=True)
    accelerator = Accelerator(kwargs_handlers=[distributed_args])
    device = accelerator.device
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        filename=f'xmc_{args.dataset}_{args.mode}_{args.log}.log',
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)
    # Setup logging, we only want one process per machine to log things on the screen.
    # accelerator.is_local_main_process is only True for one process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    ch = logging.StreamHandler(sys.stdout)
    logger.addHandler(ch)
    if accelerator.is_local_main_process:
        transformers.utils.logging.set_verbosity_info()
    else:
        transformers.utils.logging.set_verbosity_error()
    logger.info(sent_trans.__file__)
    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)
    # Load pretrained model and tokenizer.  Known base checkpoints go
    # through build_encoder; anything else is loaded as a saved
    # SentenceTransformer.
    if args.model_name_or_path == 'bert-base-uncased' or args.model_name_or_path == 'sentence-transformers/paraphrase-mpnet-base-v2':
        query_encoder = build_encoder(
            args.model_name_or_path,
            args.max_label_length,
            args.pooling_mode,
            args.proj_emb_dim,
        )
    else:
        query_encoder = sent_trans.SentenceTransformer(args.model_name_or_path)
    tokenizer = query_encoder._first_module().tokenizer
    # Query and block encoders share weights here.
    block_encoder = query_encoder
    model = DualEncoderModel(query_encoder, block_encoder, args.mode)
    model = model.to(device)
    # the whole label set
    data_path = os.path.join(os.path.abspath(os.getcwd()), 'dataset',
                             args.dataset)
    all_labels = pd.read_json(os.path.join(data_path, 'lbl.json'), lines=True)
    label_list = list(all_labels.title)
    label_ids = list(all_labels.uid)
    label_data = SimpleDataset(label_list, transform=tokenizer.encode)
    # label dataloader for searching
    sampler = SequentialSampler(label_data)
    label_padding_func = lambda x: padding_util(x, tokenizer.pad_token_id, 64)
    label_dataloader = DataLoader(label_data,
                                  sampler=sampler,
                                  batch_size=16,
                                  collate_fn=label_padding_func)
    # label dataloader for regularization
    reg_sampler = RandomSampler(label_data)
    reg_dataloader = DataLoader(label_data,
                                sampler=reg_sampler,
                                batch_size=4,
                                collate_fn=label_padding_func)
    # Build the training dataset for the chosen mode.
    if args.mode == 'ict':
        train_data = ICTXMCDataset(tokenizer=tokenizer, dataset=args.dataset)
    elif args.mode == 'self-train':
        train_data = PosDataset(tokenizer=tokenizer,
                                dataset=args.dataset,
                                labels=label_list,
                                mode=args.mode)
    elif args.mode == 'finetune-pair':
        # Supervised (instance, label) pairs, subsampled to args.ratio and
        # broadcast so all ranks use the same subset.
        train_path = os.path.join(data_path, 'trn.json')
        pos_pair = []
        with open(train_path) as fp:
            for i, line in enumerate(fp):
                inst = json.loads(line.strip())
                inst_id = inst['uid']
                for ind in inst['target_ind']:
                    pos_pair.append((inst_id, ind, i))
        dataset_size = len(pos_pair)
        indices = list(range(dataset_size))
        split = int(np.floor(args.ratio * dataset_size))
        np.random.shuffle(indices)
        train_indices = indices[:split]
        torch.distributed.broadcast_object_list(train_indices,
                                                src=0,
                                                group=None)
        sample_pairs = [pos_pair[i] for i in train_indices]
        train_data = PosDataset(tokenizer=tokenizer,
                                dataset=args.dataset,
                                labels=label_list,
                                mode=args.mode,
                                sample_pairs=sample_pairs)
    elif args.mode == 'finetune-label':
        # Sample a ratio of labels (rather than pairs) and expand each to
        # its instances; broadcast keeps ranks consistent.
        label_index = []
        label_path = os.path.join(data_path, 'label_index.json')
        with open(label_path) as fp:
            for line in fp:
                label_index.append(json.loads(line.strip()))
        np.random.shuffle(label_index)
        sample_size = int(np.floor(args.ratio * len(label_index)))
        sample_label = label_index[:sample_size]
        torch.distributed.broadcast_object_list(sample_label,
                                                src=0,
                                                group=None)
        sample_pairs = []
        for i, label in enumerate(sample_label):
            ind = label['ind']
            for inst_id in label['instance']:
                sample_pairs.append((inst_id, ind, i))
        train_data = PosDataset(tokenizer=tokenizer,
                                dataset=args.dataset,
                                labels=label_list,
                                mode=args.mode,
                                sample_pairs=sample_pairs)
    train_sampler = RandomSampler(train_data)
    padding_func = lambda x: ICT_batchify(x, tokenizer.pad_token_id, 64, 288)
    train_dataloader = torch.utils.data.DataLoader(
        train_data,
        sampler=train_sampler,
        batch_size=args.per_device_train_batch_size,
        num_workers=4,
        pin_memory=False,
        collate_fn=padding_func)
    try:
        accelerator.print("load cache")
        all_instances = torch.load(
            os.path.join(data_path, 'all_passages_with_titles.json.cache.pt'))
        test_data = SimpleDataset(all_instances.values())
    # NOTE(review): bare except silently falls back to rebuilding the
    # cache on ANY failure (including KeyboardInterrupt); also note that
    # ``test_ids`` is only defined on this fallback path, yet is used
    # below — the cache-hit path looks broken. Confirm with the author.
    except:
        all_instances = {}
        test_path = os.path.join(data_path, 'tst.json')
        if args.mode == 'ict':
            # ICT also evaluates on the training passages it was built from.
            train_path = os.path.join(data_path, 'trn.json')
            train_instances = {}
            valid_passage_ids = train_data.valid_passage_ids
            with open(train_path) as fp:
                for line in fp:
                    inst = json.loads(line.strip())
                    train_instances[
                        inst['uid']] = inst['title'] + '\t' + inst['content']
            for inst_id in valid_passage_ids:
                all_instances[inst_id] = train_instances[inst_id]
        test_ids = []
        with open(test_path) as fp:
            for line in fp:
                inst = json.loads(line.strip())
                all_instances[
                    inst['uid']] = inst['title'] + '\t' + inst['content']
                test_ids.append(inst['uid'])
        simple_transform = lambda x: tokenizer.encode(
            x, max_length=288, truncation=True)
        test_data = SimpleDataset(list(all_instances.values()),
                                  transform=simple_transform)
    inst_num = len(test_data)
    sampler = SequentialSampler(test_data)
    sent_padding_func = lambda x: padding_util(x, tokenizer.pad_token_id, 288)
    instance_dataloader = DataLoader(test_data,
                                     sampler=sampler,
                                     batch_size=128,
                                     collate_fn=sent_padding_func)
    # prepare pairs: qrels maps query_id -> {corpus_id: relevance score}.
    reader = csv.reader(open(os.path.join(data_path, 'all_pairs.txt'),
                             encoding="utf-8"),
                        delimiter=" ")
    qrels = {}
    for id, row in enumerate(reader):
        query_id, corpus_id, score = row[0], row[1], int(row[2])
        if query_id not in qrels:
            qrels[query_id] = {corpus_id: score}
        else:
            qrels[query_id][corpus_id] = score
    logging.info("| |ICT_dataset|={} pairs.".format(len(train_data)))
    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=1e-8)
    # Prepare everything with our `accelerator`.
    model, optimizer, train_dataloader, label_dataloader, reg_dataloader, instance_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, label_dataloader, reg_dataloader,
        instance_dataloader)
    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    # args.max_train_steps = 100000
    args.num_train_epochs = math.ceil(args.max_train_steps /
                                      num_update_steps_per_epoch)
    # 10% of the step budget is used for LR warmup.
    args.num_warmup_steps = int(0.1 * args.max_train_steps)
    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )
    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_data)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(
        f" Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Learning Rate = {args.learning_rate}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")
    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0
    from torch.cuda.amp import autocast
    scaler = torch.cuda.amp.GradScaler()
    # Initial clustering pass before any training step.
    cluster_result = eval_and_cluster(args, logger, completed_steps,
                                      accelerator.unwrap_model(model),
                                      label_dataloader, label_ids,
                                      instance_dataloader, inst_num, test_ids,
                                      qrels, accelerator)
    reg_iter = iter(reg_dataloader)
    trial_name = f"dim-{args.proj_emb_dim}-bs-{args.per_device_train_batch_size}-{args.dataset}-{args.log}-{args.mode}"
    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t for t in batch)
            label_tokens, inst_tokens, indices = batch
            if args.mode == 'ict':
                # Cycle the regularization loader indefinitely.
                try:
                    reg_data = next(reg_iter)
                except StopIteration:
                    reg_iter = iter(reg_dataloader)
                    reg_data = next(reg_iter)
            # Use cluster assignments as pseudo-labels when available.
            if cluster_result is not None:
                pseudo_labels = cluster_result[indices]
            else:
                pseudo_labels = indices
            with autocast():
                if args.mode == 'ict':
                    label_emb, inst_emb, inst_emb_aug, reg_emb = model(
                        label_tokens, inst_tokens, reg_data)
                    loss, stats_dict = loss_function_reg(
                        label_emb, inst_emb, inst_emb_aug, reg_emb,
                        pseudo_labels, accelerator)
                else:
                    label_emb, inst_emb = model(label_tokens,
                                                inst_tokens,
                                                reg_data=None)
                    loss, stats_dict = loss_function(label_emb, inst_emb,
                                                     pseudo_labels,
                                                     accelerator)
            loss = loss / args.gradient_accumulation_steps
            scaler.scale(loss).backward()
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            # Optimizer step once per accumulation window (and at epoch end).
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                scaler.step(optimizer)
                scaler.update()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1
                if completed_steps % args.logging_steps == 0:
                    if args.mode == 'ict':
                        logger.info(
                            "| Epoch [{:4d}/{:4d}] Step [{:8d}/{:8d}] Total Loss {:.6e} Contrast Loss {:.6e} Reg Loss {:.6e}"
                            .format(
                                epoch,
                                args.num_train_epochs,
                                completed_steps,
                                args.max_train_steps,
                                stats_dict["loss"].item(),
                                stats_dict["contrast_loss"].item(),
                                stats_dict["reg_loss"].item(),
                            ))
                    else:
                        logger.info(
                            "| Epoch [{:4d}/{:4d}] Step [{:8d}/{:8d}] Total Loss {:.6e}"
                            .format(
                                epoch,
                                args.num_train_epochs,
                                completed_steps,
                                args.max_train_steps,
                                stats_dict["loss"].item(),
                            ))
                if completed_steps % args.eval_steps == 0:
                    # Re-cluster and checkpoint both encoders.
                    cluster_result = eval_and_cluster(
                        args, logger, completed_steps,
                        accelerator.unwrap_model(model), label_dataloader,
                        label_ids, instance_dataloader, inst_num, test_ids,
                        qrels, accelerator)
                    unwrapped_model = accelerator.unwrap_model(model)
                    unwrapped_model.label_encoder.save(
                        f"{args.output_dir}/{trial_name}/label_encoder")
                    unwrapped_model.instance_encoder.save(
                        f"{args.output_dir}/{trial_name}/instance_encoder")
            if completed_steps >= args.max_train_steps:
                break
def train(args, train_dataset, model, tokenizer):
    """Train ``model`` on ``train_dataset`` with optional resume from a
    checkpoint.

    Supports fp16 via apex, gradient accumulation, DataParallel / DDP, and
    restores optimizer/scheduler state when found next to the model.

    :return: (global_step, average training loss per optimizer step)
    """
    # TensorBoard writer only on the main process.
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # Either a fixed step budget (deriving epochs) or a fixed epoch count
    # (deriving the total step budget).
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are exempt from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, 'optimizer.pt')) and os.path.isfile(
                os.path.join(args.model_name_or_path, 'scheduler.pt')):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, 'optimizer.pt')))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, 'scheduler.pt')))
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)
    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.output_dir) and not args.overwrite_output_dir:
        # set global_step to gobal_step of last saved checkpoint from model path
        # NOTE(review): this assumes output_dir ends in "checkpoint-<N>";
        # int() raises ValueError for any other directory name — confirm
        # callers always pass such a path when not overwriting.
        global_step = int(args.output_dir.split('-')[-1].split('/')[0])
        epochs_trained = global_step // (len(train_dataloader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)
        logger.info(
            " Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info(" Continuing training from epoch %d", epochs_trained)
        logger.info(" Continuing training from global step %d", global_step)
        logger.info(" Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            # Hierarchical input variants pack extra position tensors into
            # the input_ids slot as a dict.
            if args.input_type == 'para_sent':
                inputs['input_ids'] = {
                    "input_id": batch[0],
                    'para_pos': batch[4],
                    'sent_pos': batch[5]
                }
            elif args.input_type == 'para_sent_token':
                inputs['input_ids'] = {
                    "input_id": batch[0],
                    'para_pos': batch[4],
                    'sent_pos': batch[5],
                    'token_pos': batch[6]
                }
            elif args.input_type == 'sent_token':
                inputs['input_ids'] = {
                    "input_id": batch[0],
                    'token_pos': batch[5],
                    'sent_pos': batch[4]
                }
            elif args.input_type == 'para_token':
                inputs['input_ids'] = {
                    "input_id": batch[0],
                    'token_pos': batch[5],
                    'sent_pos': batch[4]
                }
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in [
                    'bert', 'xlnet'
                ] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            tr_loss += loss.item()
            # Optimizer step only once every gradient_accumulation_steps
            # micro-batches.
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = 'eval_{}'.format(key)
                            logs[eval_key] = value
                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs['learning_rate'] = learning_rate_scalar
                    logs['loss'] = loss_scalar
                    logging_loss = tr_loss
                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{'step': global_step}}))
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, 'optimizer.pt'))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, 'scheduler.pt'))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        # save_steps < 0 means: checkpoint once at the end of every epoch
        # instead of periodically by step count.
        if args.save_steps < 0 and args.local_rank in [-1, 0]:
            # Save model checkpoint
            output_dir = os.path.join(args.output_dir,
                                      'checkpoint-{}'.format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            model_to_save = model.module if hasattr(
                model,
                'module') else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)
            torch.save(args, os.path.join(output_dir, 'training_args.bin'))
            logger.info("Saving model checkpoint to %s", output_dir)
            torch.save(optimizer.state_dict(),
                       os.path.join(output_dir, 'optimizer.pt'))
            torch.save(scheduler.state_dict(),
                       os.path.join(output_dir, 'scheduler.pt'))
            logger.info("Saving optimizer and scheduler states to %s",
                        output_dir)
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
def main():
    """Fine-tune a seq2seq translation model with HuggingFace Accelerate.

    End-to-end driver: parses CLI arguments, loads the dataset and the
    pretrained model/tokenizer, tokenizes the corpus, builds dataloaders,
    trains with gradient accumulation, evaluates with sacreBLEU after each
    epoch, and finally saves the unwrapped model to ``args.output_dir``.
    """
    # Parse the arguments
    args = parse_args()

    # Initialize the accelerator. We will let the accelerator handle device
    # placement for us in this example.
    accelerator = Accelerator()

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(accelerator.state)

    # Setup logging, we only want one process per machine to log things on
    # the screen. accelerator.is_local_main_process is only True for one
    # process per machine.
    logger.setLevel(
        logging.INFO if accelerator.is_local_main_process else logging.ERROR)
    if accelerator.is_local_main_process:
        datasets.utils.logging.set_verbosity_warning()
        transformers.utils.logging.set_verbosity_info()
    else:
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()

    # If passed along, set the training seed now.
    if args.seed is not None:
        set_seed(args.seed)

    # Get the datasets: either the name of a hub dataset or local
    # CSV/JSON/TXT files. In distributed training, load_dataset guarantees
    # that only one local process downloads the dataset concurrently.
    if args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        raw_datasets = load_dataset(args.dataset_name,
                                    args.dataset_config_name)
    else:
        data_files = {}
        if args.train_file is not None:
            data_files["train"] = args.train_file
        if args.validation_file is not None:
            data_files["validation"] = args.validation_file
        extension = args.train_file.split(".")[-1]
        raw_datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer. In distributed training, the
    # .from_pretrained methods guarantee that only one local process can
    # concurrently download model & vocab.
    if args.config_name:
        # Bug fix: honor the explicitly requested config. This branch
        # previously loaded the config from args.model_name_or_path, which
        # silently ignored --config_name.
        config = AutoConfig.from_pretrained(args.config_name)
    elif args.model_name_or_path:
        config = AutoConfig.from_pretrained(args.model_name_or_path)
    else:
        config = CONFIG_MAPPING[args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    if args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            args.tokenizer_name, use_fast=not args.use_slow_tokenizer)
    elif args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            args.model_name_or_path, use_fast=not args.use_slow_tokenizer)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if args.model_name_or_path:
        model = AutoModelForSeq2SeqLM.from_pretrained(
            args.model_name_or_path,
            from_tf=bool(".ckpt" in args.model_name_or_path),
            config=config,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForSeq2SeqLM.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Set decoder_start_token_id (mBART must start decoding with the target
    # language code).
    if model.config.decoder_start_token_id is None and isinstance(
            tokenizer, (MBartTokenizer, MBartTokenizerFast)):
        assert (args.target_lang is not None and args.source_lang is not
                None), "mBart requires --target_lang and --source_lang"
        if isinstance(tokenizer, MBartTokenizer):
            model.config.decoder_start_token_id = tokenizer.lang_code_to_id[
                args.target_lang]
        else:
            model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(
                args.target_lang)

    if model.config.decoder_start_token_id is None:
        raise ValueError(
            "Make sure that `config.decoder_start_token_id` is correctly defined"
        )

    prefix = args.source_prefix if args.source_prefix is not None else ""

    # Preprocessing the datasets. First we tokenize all the texts.
    column_names = raw_datasets["train"].column_names

    # For translation we set the codes of our source and target languages
    # (only useful for mBART, the others will ignore those attributes).
    if isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
        if args.source_lang is not None:
            tokenizer.src_lang = args.source_lang
        if args.target_lang is not None:
            tokenizer.tgt_lang = args.target_lang

    # Get the language codes for input/target.
    source_lang = args.source_lang.split("_")[0]
    target_lang = args.target_lang.split("_")[0]

    # (The duplicate re-assignment of `padding` present before was removed.)
    padding = "max_length" if args.pad_to_max_length else False

    # Temporarily set max_target_length for training.
    max_target_length = args.max_target_length

    def preprocess_function(examples):
        # Pull the parallel sentence pairs out of the "translation" column
        # and apply the optional task prefix (e.g. for T5).
        inputs = [ex[source_lang] for ex in examples["translation"]]
        targets = [ex[target_lang] for ex in examples["translation"]]
        inputs = [prefix + inp for inp in inputs]
        model_inputs = tokenizer(inputs,
                                 max_length=args.max_source_length,
                                 padding=padding,
                                 truncation=True)

        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets,
                               max_length=max_target_length,
                               padding=padding,
                               truncation=True)

        # If we are padding here, replace all tokenizer.pad_token_id in the
        # labels by -100 when we want to ignore padding in the loss.
        if padding == "max_length" and args.ignore_pad_token_for_loss:
            labels["input_ids"] = [[
                (l if l != tokenizer.pad_token_id else -100) for l in label
            ] for label in labels["input_ids"]]

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    processed_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        num_proc=args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not args.overwrite_cache,
        desc="Running tokenizer on dataset",
    )

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(
            f"Sample {index} of the training set: {train_dataset[index]}.")

    # DataLoaders creation:
    label_pad_token_id = -100 if args.ignore_pad_token_for_loss else tokenizer.pad_token_id
    if args.pad_to_max_length:
        # If padding was already done to max length, the default data
        # collator just converts everything to tensors.
        data_collator = default_data_collator
    else:
        # Otherwise, `DataCollatorForSeq2Seq` will apply dynamic padding
        # (to the maximum length of the samples passed). When using mixed
        # precision, `pad_to_multiple_of=8` pads all tensors to a multiple
        # of 8, enabling Tensor Cores on NVIDIA hardware with compute
        # capability >= 7.5 (Volta).
        data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            label_pad_token_id=label_pad_token_id,
            pad_to_multiple_of=8 if accelerator.use_fp16 else None,
        )

    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  collate_fn=data_collator,
                                  batch_size=args.per_device_train_batch_size)
    eval_dataloader = DataLoader(eval_dataset,
                                 collate_fn=data_collator,
                                 batch_size=args.per_device_eval_batch_size)

    # Optimizer: split weights in two groups, one with weight decay and the
    # other not (biases and LayerNorm weights are conventionally exempt).
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Prepare everything with our `accelerator`. Note -> the training
    # dataloader needs to be prepared before we grab its length below
    # (because it will be shorter in multiprocess settings).
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader)

    # Scheduler and math around the number of training steps.
    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    else:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)

    lr_scheduler = get_scheduler(
        name=args.lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=args.num_warmup_steps,
        num_training_steps=args.max_train_steps,
    )

    metric = load_metric("sacrebleu")

    def postprocess_text(preds, labels):
        # sacreBLEU expects stripped predictions and one *list* of
        # references per prediction.
        preds = [pred.strip() for pred in preds]
        labels = [[label.strip()] for label in labels]
        return preds, labels

    # Train!
    total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f" Num examples = {len(train_dataset)}")
    logger.info(f" Num Epochs = {args.num_train_epochs}")
    logger.info(
        f" Instantaneous batch size per device = {args.per_device_train_batch_size}"
    )
    logger.info(
        f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
    )
    logger.info(
        f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f" Total optimization steps = {args.max_train_steps}")

    # Only show the progress bar once on each machine.
    progress_bar = tqdm(range(args.max_train_steps),
                        disable=not accelerator.is_local_main_process)
    completed_steps = 0

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            outputs = model(**batch)
            loss = outputs.loss
            loss = loss / args.gradient_accumulation_steps
            accelerator.backward(loss)
            # Step the optimizer on accumulation boundaries and on the final
            # (possibly short) batch of the epoch.
            if step % args.gradient_accumulation_steps == 0 or step == len(
                    train_dataloader) - 1:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                completed_steps += 1

            if completed_steps >= args.max_train_steps:
                break

        # Per-epoch evaluation with generation + sacreBLEU.
        model.eval()
        if args.val_max_target_length is None:
            args.val_max_target_length = args.max_target_length

        gen_kwargs = {
            "max_length":
            args.val_max_target_length
            if args is not None else config.max_length,
            "num_beams":
            args.num_beams,
        }
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                generated_tokens = accelerator.unwrap_model(model).generate(
                    batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    **gen_kwargs,
                )

                # Pad across processes so gather() sees equal-length tensors.
                generated_tokens = accelerator.pad_across_processes(
                    generated_tokens, dim=1, pad_index=tokenizer.pad_token_id)
                labels = batch["labels"]
                if not args.pad_to_max_length:
                    # If we did not pad to max length, we need to pad the
                    # labels too before gathering.
                    labels = accelerator.pad_across_processes(
                        batch["labels"],
                        dim=1,
                        pad_index=tokenizer.pad_token_id)

                generated_tokens = accelerator.gather(
                    generated_tokens).cpu().numpy()
                labels = accelerator.gather(labels).cpu().numpy()

                if args.ignore_pad_token_for_loss:
                    # Replace -100 in the labels as we can't decode them.
                    labels = np.where(labels != -100, labels,
                                      tokenizer.pad_token_id)

                decoded_preds = tokenizer.batch_decode(
                    generated_tokens, skip_special_tokens=True)
                decoded_labels = tokenizer.batch_decode(
                    labels, skip_special_tokens=True)

                decoded_preds, decoded_labels = postprocess_text(
                    decoded_preds, decoded_labels)

                metric.add_batch(predictions=decoded_preds,
                                 references=decoded_labels)
        eval_metric = metric.compute()
        logger.info({"bleu": eval_metric["score"]})

    if args.output_dir is not None:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(args.output_dir,
                                        save_function=accelerator.save)
def train(args, train_dataset, model, tokenizer):
    """Train the model with a curriculum-learning schedule.

    Schedules ``args.num_train_epochs`` full passes over ``train_dataset``
    first, then appends difficulty-ordered subsets produced by
    ``Difficulty_Evaluation``. Supports fp16 via apex, multi-GPU
    DataParallel, distributed training, resuming from a checkpoint path,
    TensorBoard logging and periodic checkpointing.

    :param args: training configuration namespace (batch sizes, steps, paths, ...)
    :param train_dataset: the full training dataset
    :param model: the model to fine-tune
    :param tokenizer: tokenizer saved alongside each checkpoint
    :return: tuple ``(global_step, average training loss per step)``
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler_total = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    subset_quantity = args.div_subset

    # Difficulty-based partition of the training data into curriculum stages.
    curriculum_sets_temp = []
    # Make sure every curriculum subset actually gets sampled.
    diff_eval_result = Difficulty_Evaluation(args, train_dataset)
    for i, subset in enumerate(diff_eval_result):
        # Per-subset threshold: total number of batches spread evenly
        # across the requested number of subsets.
        gate = int(
            (len(train_dataset) / args.train_batch_size) / (subset_quantity))
        print("第", i, "个 num:", len(subset), " 阈值 ", gate)
        random.shuffle(subset)
        # Subsets at or below the threshold are kept whole (no sampling).
        if len(subset) > gate:
            # Downsample large subsets to gate / subset_quantity items.
            curriculum_sets_temp.append(subset[0:int(gate / subset_quantity)])
        else:
            curriculum_sets_temp.append(subset)

    # Schedule the full-dataset passes first...
    curriculum_sets = []
    total_train_dataloader = DataLoader(train_dataset,
                                        sampler=train_sampler_total,
                                        batch_size=args.train_batch_size)
    for i in range(int(args.num_train_epochs)):
        curriculum_sets.append(total_train_dataloader)
    # ...then append the curriculum stages (stage order matters).
    curriculum_sets += curriculum_sets_temp

    # Step budget for the curriculum-learning phase.
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(curriculum_sets[0]) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            curriculum_sets[0]
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    # NOTE: the extra weight_decay here adds L2 regularization on top of
    # the per-group settings above.
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon,
                      weight_decay=0.01)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(curriculum_sets[0]))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved
            # checkpoint from the model path (".../checkpoint-<step>")
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(curriculum_sets[0]) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(curriculum_sets[0]) // args.gradient_accumulation_steps)

            logger.info(
                " Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d",
                        global_step)
            logger.info(" Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info(" Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    # One "epoch" per entry in curriculum_sets (full passes + stages).
    train_iterator = trange(
        epochs_trained,
        int(len(curriculum_sets)),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)
    current_stage = 0
    for _ in train_iterator:
        epoch_iterator = tqdm(curriculum_sets[current_stage],
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }
            # These model families take no token_type_ids.
            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert", "bart",
                    "longformer"
            ]:
                del inputs["token_type_ids"]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    # Bug fix: create the checkpoint directory before saving;
                    # torch.save below fails if it does not exist.
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(
                        model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
        current_stage += 1

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
class Experiment(object): def __init__(self, config, model, tokenizer, total_samples=None, label_names=None, results=None, run_name=None): self.config = config self.model = model self.tokenizer = tokenizer self.global_step = 0 self.optimizer_state_dict = None self.scheduler_state_dict = None self.total_samples = total_samples self.label_names = label_names self.results = results self.run_name = run_name util.set_seed(config) __prfs_names = ['precision', 'recall', 'f1', 'support'] __report_metrics = [ 'acc', 'macro_f1', 'micro_f1', 'macro_auc', 'avg_precision' ] def after_eval_cb(self, eval_name, result, pred_label_ids, preds, extra_log): row = OrderedDict(step=self.global_step, eval_name=eval_name, run_name=self.run_name) row.update(extra_log) for key in self.__report_metrics: if key in result: row[key] = result[key] prfs = result['prfs'] for metric_idx, metric_name in enumerate(self.__prfs_names): for label_idx, label_name in enumerate(self.label_names): col_name = f"{label_name}_{metric_name}" row[col_name] = result['prfs'][metric_idx][label_idx] if self.config.seeds: row['seed'] = self.config.seed if self.results is None: logger.warning("Creating new results DataFrame") self.results = pd.DataFrame(row, columns=row.keys(), index=[0]) else: logger.debug("Adding row: %s", row) self.results = self.results.append(row, ignore_index=True) if self.config.get('out_file', None): self.results.to_csv(self.config.out_file, index=False) # results = self.results # key = self.run_name # if key not in results: # results[key] = {} # if eval_name not in results[key]: # results[key][eval_name] = {} # results[key][eval_name][self.global_step] = result # with open(self.config.out_file, 'w') as f: # json.dump(results, f, indent=4, cls=util.ExtendedJSONEncoder) def after_logging(self, result): pass def train(self, train_dataloader, valid_dataloader=None, test_dataloader=None, should_continue=False): """ Train the model """ tb_writer = SummaryWriter() train_epochs = 
self.config.train_epochs if self.config.max_steps > 0: train_steps = self.config.max_steps train_epochs = self.config.max_steps // ( len(train_dataloader) // self.config.grad_acc_steps) + 1 else: train_steps = len( train_dataloader) // self.config.grad_acc_steps * train_epochs if self.total_samples and should_continue: steps_total = self.total_samples // self.config.train_bs // self.config.grad_acc_steps * train_epochs else: steps_total = train_steps # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ { "params": [ p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": self.config.weight_decay, }, { "params": [ p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0 }, ] self.optimizer = AdamW( optimizer_grouped_parameters, lr=self.config.lr, eps=self.config.adam_eps, ) self.scheduler = get_linear_schedule_with_warmup( self.optimizer, num_warmup_steps=self.config.warmup_steps, num_training_steps=steps_total) # self.scheduler = get_constant_schedule(self.optimizer) if should_continue and self.global_step > 0: logger.info("loading saved optimizer and scheduler states") assert (self.optimizer_state_dict) assert (self.scheduler_state_dict) self.optimizer.load_state_dict(self.optimizer_state_dict) self.scheduler.load_state_dict(self.scheduler_state_dict) else: logger.info("Using fresh optimizer and scheduler") if self.config.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) self.model, self.optimizer = amp.initialize( self.model, self.optimizer, opt_level=self.config.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) if self.config.n_gpu > 1 and not isinstance(self.model, torch.nn.DataParallel): self.model = torch.nn.DataParallel(self.model) logger.info("***** Running training *****") logger.info(" Num examples = %d (%d)", len(train_dataloader.dataset), len(train_dataloader)) logger.info(" Num Epochs = %d", train_epochs) logger.info(" Batch size = %d", self.config.train_bs) logger.info(" Learning rate = %e", self.config.lr) logger.info(" Loss label weights = %s", self.config.loss_label_weights) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", self.config.train_bs * self.config.grad_acc_steps) logger.info(" Gradient Accumulation steps = %d", self.config.grad_acc_steps) logger.info(" Total optimization steps = %d", train_steps) if not should_continue: self.global_step = 0 epochs_trained = 0 steps_trained_in_current_epoch = 0 # # Check if continuing training from a checkpoint # if os.path.exists(self.config.model_path): # if self.config.should_continue: # step_str = self.config.model_path.split("-")[-1].split("/")[0] # if step_str: # # set self.global_step to gobal_step of last saved checkpoint from model path # self.global_step = int(step_str) # epochs_trained = self.global_step // (len(train_dataloader) // # self.config.grad_acc_steps) # steps_trained_in_current_epoch = self.global_step % ( # len(train_dataloader) // self.config.grad_acc_steps) # logger.info( # " Continuing training from checkpoint, will skip to saved self.global_step") # logger.info( # " Continuing training from epoch %d", epochs_trained) # logger.info( # " Continuing training from global step %d", self.global_step) # logger.info(" Will skip the first %d steps in the first epoch", # steps_trained_in_current_epoch) train_loss = 0.0 self.model.zero_grad() train_iterator = trange( epochs_trained, 
int(train_epochs), desc="Epoch", ) util.set_seed(self.config) # Added here for reproductibility self.model.train() if self.config.train_head_only: for param in self.model.roberta.embeddings.parameters(): param.requires_grad = False logger.info("Training only head") # for param in self.model.__getattr__(self.config.model_type).roberta.parameters(): # param.requires_grad = False for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(epoch_iterator): # Skip past any already trained steps if resuming training if steps_trained_in_current_epoch > 0: steps_trained_in_current_epoch -= 1 continue self.model.train() inputs = self.__inputs_from_batch(batch) outputs = self.model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] if self.config.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel training if self.config.grad_acc_steps > 1: loss = loss / self.config.grad_acc_steps if self.config.fp16: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() batch_loss = loss.item() train_loss += batch_loss if (step + 1) % self.config.grad_acc_steps == 0: if self.config.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(self.optimizer), self.config.max_grad_norm) else: torch.nn.utils.clip_grad_norm_( self.model.parameters(), self.config.max_grad_norm) self.optimizer.step() self.scheduler.step() # Update learning rate schedule self.model.zero_grad() self.global_step += 1 if self.config.logging_steps > 0 and self.global_step % self.config.logging_steps == 0: logs = {} if valid_dataloader: result_valid, * \ _ = self.evaluate( 'valid', valid_dataloader, backtrans=(test_dataloader == None)) logs.update({ f"valid_{k}": v for k, v in result_valid.items() }) if test_dataloader: test_dataloader = test_dataloader if isinstance( test_dataloader, dict) else { 'test': test_dataloader } for eval_name, dataloader_or_tuple in 
test_dataloader.items( ): if isinstance(dataloader_or_tuple, tuple): dataloader, kwargs = dataloader_or_tuple else: dataloader = dataloader_or_tuple kwargs = {} result_test, * \ _ = self.evaluate( eval_name, dataloader, **kwargs) logs.update({ f"{eval_name}_{k}": v for k, v in result_test.items() }) learning_rate_scalar = self.scheduler.get_last_lr()[0] logger.info("Learning rate: %f (at step %d)", learning_rate_scalar, step) logs["learning_rate"] = learning_rate_scalar logs["train_loss"] = train_loss self.after_logging(logs) logger.info("Batch loss: %f", batch_loss) # for key, value in logs.items(): # tb_writer.add_scalar(key, value, self.global_step) if self.config.save_steps > 0 and self.global_step % self.config.save_steps == 0: # Save model checkpoint self.save_checkpoint() if self.config.max_steps > 0 and self.global_step > self.config.max_steps: epoch_iterator.close() break if self.config.max_steps > 0 and self.global_step > self.config.max_steps: train_iterator.close() break if self.config.train_head_only: logger.info("Training only head") # for param in self.model.__getattr__(self.config.model_type).parameters(): # param.requires_grad = True for param in self.model.roberta.embeddings.parameters(): param.requires_grad = False tb_writer.close() self.optimizer_state_dict = self.optimizer.state_dict() self.scheduler_state_dict = self.scheduler.state_dict() avg_train_loss = train_loss / self.global_step logger.info("Learning rate now: %s", self.scheduler.get_last_lr()) logger.info("***** Done training *****") return self.global_step, avg_train_loss def save_model(self, model_path): if not os.path.exists(model_path): os.makedirs(model_path) logger.info("Saving model to %s", model_path) # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
# They can then be reloaded using `from_pretrained()` model_to_save = (self.model.module if hasattr(self.model, "module") else self.model) model_to_save.save_pretrained(model_path) self.tokenizer.save_pretrained(model_path) # Good practice: save your training arguments together with the trained model torch.save(self.config.as_dict(), os.path.join(model_path, "training_config.bin")) def save_checkpoint(self): output_dir = os.path.join(self.config.output_model_path, "checkpoint-{}".format(self.global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) model_to_save = ( self.model.module if hasattr(self.model, "module") else self.model ) # Take care of distributed/parallel training model_to_save.save_pretrained(output_dir) self.tokenizer.save_pretrained(output_dir) torch.save(self.config.as_dict(), os.path.join(output_dir, "training_self.config.bin")) logger.info("Saving model checkpoint to %s", output_dir) torch.save(self.optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt")) torch.save(self.scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt")) logger.info("Saving optimizer and scheduler states to %s", output_dir) def predict(self, dataloader): self.model.eval() preds = None for batch in tqdm(dataloader, desc="Predicting"): batch = tuple(t.to(self.config.device) for t in batch) input_ids, attention_mask, _ = batch with torch.no_grad(): inputs = { "input_ids": input_ids, "attention_mask": attention_mask } # if config.model_type != "distilbert": # inputs["token_type_ids"] = ( # batch[2] if config.model_type in [ # "bert", "xlnet", "albert"] else None # ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids outputs = self.model(**inputs) logits = outputs[0] if preds is None: preds = logits.detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) return preds def logits_to_label_ids(self, logits): if not self.config.multi_label: label_ids = np.argmax(logits, axis=1) else: label_ids = 
F.sigmoid(torch.from_numpy(logits)).numpy() > 0.5 return label_ids def evaluate(self, eval_name, dataloader, mc_dropout=False, skip_cb=False, pred_label_ids_func=None, backtrans=True, extra_log={}): dropout_ps = {} def set_dropout_to_train(m): if type(m) == nn.Dropout: logger.info("setting dropout into train mode (%s)", str(m)) logger.info("setting dropout into train mode (%s)", str(m)) m.p = 0.5 m.train() def reset_dropout_to_eval(m): if type(m) == nn.Dropout: p = dropout_ps[m] logger.info("reseting dropout into eval mode (%s) p=%d", str(m), p) m.p = p m.eval() # Eval! logger.info("***** Running evaluation %s*****", eval_name) logger.info(" Num examples = %d", len(dataloader.dataset)) logger.info(" Batch size = %d", self.config.eval_bs) eval_loss = 0.0 nb_eval_steps = 0 preds = None true_label_ids = None self.model.eval() if mc_dropout: self.model.apply(set_dropout_to_train) for batch in tqdm(dataloader, desc="Evaluating"): with torch.no_grad(): inputs = self.__inputs_from_batch(batch) labels = inputs['labels'] outputs = self.model(**inputs) tmp_eval_loss, logits = outputs[:2] eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() true_label_ids = labels.detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) true_label_ids = np.append(true_label_ids, labels.detach().cpu().numpy(), axis=0) if mc_dropout: self.model.apply(reset_dropout_to_eval) eval_loss = eval_loss / nb_eval_steps if self.config.test_backtrans_langs and backtrans: logger.info('Using test augmentation...') groups = np.split(preds, len(self.config.test_backtrans_langs) + 1) #preds = sum(groups) preds = np.mean(groups, axis=0) #preds = np.maximum.reduce(groups) true_label_ids = true_label_ids[:preds.shape[0]] label_idxs = list(range(len(self.label_names))) if self.config.soft_label: true_label_ids = np.argmax(true_label_ids, axis=1) pred_label_ids = self.logits_to_label_ids(preds) if pred_label_ids_func: 
pred_label_ids = pred_label_ids_func(pred_label_ids) # print(out_label_ids) # print(max_preds) # print(out_label_ids.shape, max_preds.shape) result = { 'acc': accuracy_score(true_label_ids, pred_label_ids), 'macro_f1': f1_score(true_label_ids, pred_label_ids, average='macro'), 'micro_f1': f1_score(true_label_ids, pred_label_ids, average='micro'), 'prfs': precision_recall_fscore_support(true_label_ids, pred_label_ids, labels=label_idxs) } if not self.config.multi_label: result['cm'] = confusion_matrix(true_label_ids, pred_label_ids).ravel() if self.config.num_labels == 2: result['macro_auc'] = roc_auc_score(true_label_ids, pred_label_ids, average='macro') result['avg_precision'] = average_precision_score( true_label_ids, pred_label_ids) logger.info("***** Eval results {} *****".format(eval_name)) try: logger.info( "\n %s", classification_report( true_label_ids, pred_label_ids, labels=label_idxs, target_names=self.label_names, )) result['report'] = classification_report( true_label_ids, pred_label_ids, labels=label_idxs, target_names=self.label_names, output_dict=True) except ValueError as e: print(e) pass logger.info("\n Accuracy = %f", result['acc']) if self.config.num_labels == 2: logger.info("\n MacroAUC = %f", result['macro_auc']) logger.info("\n AUPRC = %f", result['avg_precision']) logger.info("***** Done evaluation *****") if not skip_cb: self.after_eval_cb(eval_name, result, pred_label_ids, preds, extra_log) return result, pred_label_ids, preds def __inputs_from_batch(self, batch, labels=True): batch = tuple(t.to(self.config.device) for t in batch) input_ids, attention_mask, label_ids, *rest = batch if rest: extra_features = rest[0] else: extra_features = None inputs = { "input_ids": input_ids, "attention_mask": attention_mask, "extra_features": extra_features } if labels: inputs["labels"] = label_ids # if self.config.model_type != "distilbert": # inputs["token_type_ids"] = ( # batch[2] if self.config.model_type in [ # "bert", "xlnet", "albert"] else None # 
) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids # outputs = model(b_input_ids, token_type_ids=None, # attention_mask=b_input_mask, labels=b_labels) return inputs def interpret(self, dataloader, df, label_names=None): dataset = dataloader.dataset sampler = SequentialSampler(dataset) # We need a sequential dataloader with bs=1 dataloader = DataLoader(dataset, sampler=sampler, batch_size=1, num_workers=4) logger.info("***** Running interpretation *****") logger.info(" Num examples = %d", len(dataset)) # preds = None losses = None pred_labels = [] self.model.eval() for batch in tqdm(dataloader, desc="Interpretation"): with torch.no_grad(): inputs = self.__inputs_from_batch(batch) # if config.model_type != "distilbert": # inputs["token_type_ids"] = ( # batch[2] if config.model_type in [ # "bert", "xlnet", "albert"] else None # ) # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids outputs = self.model(**inputs) batch_loss, logits = outputs[:2] if self.config.n_gpu > 1: batch_loss = batch_loss.mean( ) # mean() to average on multi-gpu parallel training batch_loss = batch_loss.detach().cpu().view(1) pred_label_ids = self.logits_to_label_ids( logits.detach().cpu()) pred_label_id = pred_label_ids[0] if label_names: pred_labels.append(label_names[pred_label_id]) else: pred_labels.append(pred_label_id) if losses is None: # preds = logits.detach().cpu().numpy() losses = batch_loss else: # preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) losses = torch.cat((losses, batch_loss), dim=0) top_values, top_indices = torch.topk(losses, 100) top_indices = top_indices.numpy() top_pred_labels = [pred_labels[top_index] for top_index in top_indices] top_df = df.iloc[top_indices] top_df = top_df.assign(loss=top_values.numpy(), pred_label=top_pred_labels) return top_df
"bert-base-uncased", num_labels=len(label_dict), output_attentions=False, output_hidden_states=False) batch_size = 3 dataloader_train = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size) dataloader_validation = DataLoader(dataset_val, sampler=SequentialSampler(dataset_val), batch_size=batch_size) optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8) epochs = 5 scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=len(dataloader_train) * epochs) def f1_score_func(preds, labels): preds_flat = np.argmax(preds, axis=1).flatten() labels_flat = labels.flatten() return f1_score(labels_flat, preds_flat, average='weighted')
def main():
    """Entry point for TinyBERT-style task distillation on a GLUE task.

    Parses command-line arguments, loads the task's train/eval data, then
    either evaluates the student model (``--do_eval``) or distills the
    teacher into the student: intermediate-layer (attention + hidden state)
    distillation when ``--pred_distill`` is off, prediction-layer
    distillation when it is on. Checkpoints and eval results are written to
    ``--output_dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--teacher_model",
                        default=None,
                        type=str,
                        help="The teacher model dir.")
    parser.add_argument("--student_model",
                        default=None,
                        type=str,
                        help="The student model dir.")
    parser.add_argument("--task_name",
                        default="SST-2",
                        type=str,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--weight_decay',
                        '--wd',
                        default=1e-4,
                        type=float,
                        metavar='W',
                        help='weight decay')
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    # added arguments
    parser.add_argument('--aug_train', action='store_true')
    parser.add_argument('--eval_step', type=float, default=0.1)
    parser.add_argument('--pred_distill', action='store_true')
    parser.add_argument('--data_url', type=str, default="")
    parser.add_argument('--temperature', type=float, default=1.)

    args = parser.parse_args()
    logger.info('The args: {}'.format(args))

    # intermediate distillation default parameters (per GLUE task)
    default_params = {
        "cola": {
            "num_train_epochs": 50,
            "max_seq_length": 64
        },
        "mnli": {
            "num_train_epochs": 5,
            "max_seq_length": 128
        },
        "mrpc": {
            "num_train_epochs": 20,
            "max_seq_length": 128
        },
        "sst-2": {
            "num_train_epochs": 10,
            "max_seq_length": 64
        },
        "sts-b": {
            "num_train_epochs": 20,
            "max_seq_length": 128
        },
        "qqp": {
            "num_train_epochs": 5,
            "max_seq_length": 128
        },
        "qnli": {
            "num_train_epochs": 10,
            "max_seq_length": 128
        },
        "rte": {
            "num_train_epochs": 20,
            "max_seq_length": 128
        }
    }

    # Which metric each task family is selected on (accuracy / correlation / MCC).
    acc_tasks = ["mnli", "mrpc", "sst-2", "qqp", "qnli", "rte"]
    corr_tasks = ["sts-b"]
    mcc_tasks = ["cola"]

    # Prepare devices
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.info("device: {} n_gpu: {}".format(device, n_gpu))

    # Prepare seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # Prepare task settings
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    # BUG FIX: the original wrote to args.max_seq_len / args.num_train_epoch
    # (missing "gth" / "s"), which silently created brand-new attributes and
    # left the real args.max_seq_length / args.num_train_epochs at their CLI
    # defaults — so the per-task defaults above were never applied.
    if task_name in default_params:
        args.max_seq_length = default_params[task_name]["max_seq_length"]

    if not args.pred_distill and not args.do_eval:
        if task_name in default_params:
            args.num_train_epochs = default_params[task_name][
                "num_train_epochs"]

    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.student_model,
                                              do_lower_case=args.do_lower_case)
    student_config = BertConfig.from_pretrained(args.student_model,
                                                num_labels=num_labels,
                                                finetuning_task=args.task_name)

    if not args.do_eval:
        if args.gradient_accumulation_steps < 1:
            raise ValueError(
                "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
                .format(args.gradient_accumulation_steps))

        # The per-step batch size; gradients are accumulated back up to the
        # requested effective batch size.
        args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

        train_data, _ = get_tensor_data(args, task_name, tokenizer, False,
                                        args.aug_train)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        num_train_optimization_steps = int(
            len(train_dataloader) /
            args.gradient_accumulation_steps) * args.num_train_epochs

    eval_data, eval_labels = get_tensor_data(args, task_name, tokenizer, True,
                                             False)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    if not args.do_eval:
        teacher_config = BertConfig.from_pretrained(
            args.teacher_model,
            num_labels=num_labels,
            finetuning_task=args.task_name)
        teacher_model = TinyBertForSequenceClassification.from_pretrained(
            args.teacher_model, config=teacher_config)
        teacher_model.to(device)

    student_model = TinyBertForSequenceClassification.from_pretrained(
        args.student_model, config=student_config)
    student_model.to(device)

    if args.do_eval:
        logger.info("***** Running evaluation *****")
        logger.info(" Num examples = %d", len(eval_data))
        logger.info(" Batch size = %d", args.eval_batch_size)

        student_model.eval()
        result = do_eval(student_model, task_name, eval_dataloader, device,
                         output_mode, eval_labels, num_labels)
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info(" %s = %s", key, str(result[key]))
    else:
        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_data))
        logger.info(" Batch size = %d", args.train_batch_size)
        logger.info(" Num steps = %d", num_train_optimization_steps)

        if n_gpu > 1:
            student_model = torch.nn.DataParallel(student_model)
            teacher_model = torch.nn.DataParallel(teacher_model)

        # Prepare optimizer: standard no-weight-decay groups for bias/LayerNorm.
        param_optimizer = list(student_model.named_parameters())
        size = 0
        for n, p in student_model.named_parameters():
            logger.info('n: {}'.format(n))
            size += p.nelement()

        logger.info('Total parameters: {}'.format(size))
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          correct_bias=False)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(num_train_optimization_steps *
                                 args.warmup_proportion),
            num_training_steps=num_train_optimization_steps)
        if not args.pred_distill:
            # Intermediate-layer distillation uses a constant LR schedule.
            scheduler = get_constant_schedule(optimizer)

        # Prepare loss functions
        loss_mse = MSELoss()

        def soft_cross_entropy(predicts, targets):
            # Cross-entropy between the student's log-probs and the teacher's
            # (temperature-softened) probability distribution.
            student_likelihood = torch.nn.functional.log_softmax(predicts,
                                                                 dim=-1)
            targets_prob = torch.nn.functional.softmax(targets, dim=-1)
            return (-targets_prob * student_likelihood).mean()

        # Train and evaluate
        global_step = 0
        best_dev_acc = 0.0
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")

        for epoch_ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0.
            tr_att_loss = 0.
            tr_rep_loss = 0.
            tr_cls_loss = 0.

            student_model.train()
            nb_tr_examples, nb_tr_steps = 0, 0

            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", ascii=True)):
                batch = tuple(t.to(device) for t in batch)

                input_ids, input_mask, segment_ids, label_ids, seq_lengths = batch
                # Drop ragged final batches so every step sees a full batch.
                if input_ids.size()[0] != args.train_batch_size:
                    continue

                att_loss = 0.
                rep_loss = 0.
                cls_loss = 0.

                student_logits, student_atts, student_reps = student_model(
                    input_ids, segment_ids, input_mask, is_student=True)

                with torch.no_grad():
                    teacher_logits, teacher_atts, teacher_reps = teacher_model(
                        input_ids, segment_ids, input_mask)

                if not args.pred_distill:
                    # Intermediate distillation: map every student layer to a
                    # uniformly-spaced teacher layer and match attentions and
                    # hidden representations with MSE.
                    teacher_layer_num = len(teacher_atts)
                    student_layer_num = len(student_atts)
                    assert teacher_layer_num % student_layer_num == 0
                    layers_per_block = int(teacher_layer_num /
                                           student_layer_num)
                    new_teacher_atts = [
                        teacher_atts[i * layers_per_block + layers_per_block -
                                     1] for i in range(student_layer_num)
                    ]

                    for student_att, teacher_att in zip(
                            student_atts, new_teacher_atts):
                        # Large negative attention scores are the padding mask;
                        # zero them out so they don't dominate the MSE.
                        student_att = torch.where(
                            student_att <= -1e2,
                            torch.zeros_like(student_att).to(device),
                            student_att)
                        teacher_att = torch.where(
                            teacher_att <= -1e2,
                            torch.zeros_like(teacher_att).to(device),
                            teacher_att)

                        tmp_loss = loss_mse(student_att, teacher_att)
                        att_loss += tmp_loss

                    # Hidden states include the embedding layer, hence the +1.
                    new_teacher_reps = [
                        teacher_reps[i * layers_per_block]
                        for i in range(student_layer_num + 1)
                    ]
                    new_student_reps = student_reps
                    for student_rep, teacher_rep in zip(
                            new_student_reps, new_teacher_reps):
                        tmp_loss = loss_mse(student_rep, teacher_rep)
                        rep_loss += tmp_loss

                    loss = rep_loss + att_loss
                    tr_att_loss += att_loss.item()
                    tr_rep_loss += rep_loss.item()
                else:
                    # Prediction-layer distillation.
                    if output_mode == "classification":
                        cls_loss = soft_cross_entropy(
                            student_logits / args.temperature,
                            teacher_logits / args.temperature)
                    elif output_mode == "regression":
                        loss_mse = MSELoss()
                        cls_loss = loss_mse(student_logits.view(-1),
                                            label_ids.view(-1))

                    loss = cls_loss
                    tr_cls_loss += cls_loss.item()

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += label_ids.size(0)
                nb_tr_steps += 1

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

                # NOTE(review): if args.eval_step * num_train_optimization_steps
                # rounds down to 0 this raises ZeroDivisionError — confirm
                # eval_step is always large enough for the chosen dataset.
                if (global_step + 1) % int(
                        args.eval_step * num_train_optimization_steps) == 0:
                    logger.info("***** Running evaluation *****")
                    logger.info(" Epoch = {} iter {} step".format(
                        epoch_, global_step))
                    logger.info(" Num examples = %d", len(eval_data))
                    logger.info(" Batch size = %d", args.eval_batch_size)

                    student_model.eval()

                    loss = tr_loss / (step + 1)
                    cls_loss = tr_cls_loss / (step + 1)
                    att_loss = tr_att_loss / (step + 1)
                    rep_loss = tr_rep_loss / (step + 1)

                    result = {}
                    if args.pred_distill:
                        result = do_eval(student_model, task_name,
                                         eval_dataloader, device, output_mode,
                                         eval_labels, num_labels)
                    result['global_step'] = global_step
                    result['cls_loss'] = cls_loss
                    result['att_loss'] = att_loss
                    result['rep_loss'] = rep_loss
                    result['loss'] = loss

                    result_to_file(result, output_eval_file)

                    # Intermediate distillation always saves; prediction
                    # distillation saves only on a new best dev metric.
                    if not args.pred_distill:
                        save_model = True
                    else:
                        save_model = False

                        if task_name in acc_tasks and result[
                                'acc'] > best_dev_acc:
                            best_dev_acc = result['acc']
                            save_model = True

                        if task_name in corr_tasks and result[
                                'corr'] > best_dev_acc:
                            best_dev_acc = result['corr']
                            save_model = True

                        if task_name in mcc_tasks and result[
                                'mcc'] > best_dev_acc:
                            best_dev_acc = result['mcc']
                            save_model = True

                    if save_model:
                        logger.info("***** Save model *****")

                        model_to_save = student_model.module if hasattr(
                            student_model, 'module') else student_model

                        model_name = "pytorch_model.bin"
                        # if not args.pred_distill:
                        #     model_name = "step_{}_{}".format(global_step, "pytorch_model.bin")
                        output_model_file = os.path.join(
                            args.output_dir, model_name)
                        output_config_file = os.path.join(
                            args.output_dir, "config.json")

                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(args.output_dir)

                        # Test mnli-mm: temporarily swap the task to evaluate
                        # the mismatched dev set, then restore it.
                        if args.pred_distill and task_name == "mnli":
                            task_name = "mnli-mm"
                            if not os.path.exists(args.output_dir + '-MM'):
                                os.makedirs(args.output_dir + '-MM')

                            eval_data, eval_labels = get_tensor_data(
                                args, task_name, tokenizer, True, False)
                            eval_sampler = SequentialSampler(eval_data)
                            eval_dataloader = DataLoader(
                                eval_data,
                                sampler=eval_sampler,
                                batch_size=args.eval_batch_size)

                            logger.info("***** Running mm evaluation *****")
                            logger.info(" Num examples = %d", len(eval_data))
                            logger.info(" Batch size = %d",
                                        args.eval_batch_size)

                            result = do_eval(student_model, task_name,
                                             eval_dataloader, device,
                                             output_mode, eval_labels,
                                             num_labels)
                            result['global_step'] = global_step

                            tmp_output_eval_file = os.path.join(
                                args.output_dir + '-MM', "eval_results.txt")
                            result_to_file(result, tmp_output_eval_file)

                            task_name = 'mnli'

                    student_model.train()
def train(args):
    """Fine-tune a classifier with FGM adversarial training.

    Builds the train/dev loaders (plus a random train-subset loader used to
    monitor held-in loss/F1), trains with one extra FGM adversarial backward
    pass per step, evaluates every ``args.eval_steps`` steps, keeps the best
    checkpoint by dev macro-F1, then reloads the best model and runs inference
    on the dev and test sets.

    :param args: namespace carrying model, data and optimization settings
    :return: (dev_logits, dev_labels, test_logits) on success, or None when
             args.model_name is not registered in MODEL_MAP
    """
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    set_seed(args)
    best_f1 = 0
    logger.info("the current config is :\n {}".format(str(vars(args))))
    if args.model_name in MODEL_MAP:
        Config, Model, Tokenizer, Transform = MODEL_MAP[args.model_name]
        config = Config.from_pretrained(args.pretrained_model_path,
                                        num_labels=args.num_labels)
        config = add_args_to_config(args, config)  # add customized args
        tokenizer = Tokenizer.from_pretrained(
            args.pretrained_model_path, do_lower_case=args.do_lower_case)
        model = load_model(Model, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)

        # adversarial training helper (perturbs embeddings)
        fgm = FGM(model)
        transform = Transform(tokenizer, args)
        train_data = Corpus(args, "train.csv", transform)
        dev_data = Corpus(args, 'dev.csv', transform)

        # BUG FIX: the dev loader was previously built twice with two separate
        # SequentialSamplers; the first construction was dead code immediately
        # overwritten by the second. Build it once.
        dev_sampler = SequentialSampler(dev_data)
        dev_loader = DataLoader(dev_data,
                                batch_size=args.eval_batch_size,
                                sampler=dev_sampler)

        train_sampler = RandomSampler(train_data)
        # Random subset of the training set, same size as the dev set, used to
        # monitor held-in loss/F1 alongside the dev metrics.
        test_sampler = SubsetRandomSampler(
            np.random.randint(low=0,
                              high=(len(train_data)),
                              size=len(dev_data)))
        train_loader = DataLoader(train_data,
                                  batch_size=args.batch_size,
                                  sampler=train_sampler,
                                  drop_last=True)
        test_loader = DataLoader(train_data,
                                 batch_size=args.eval_batch_size,
                                 sampler=test_sampler)

        logger.info("***** Running training *****")
        logger.info(" Num examples = %d", len(train_data))
        logger.info(" Batch size = %d", args.batch_size)
        logger.info(" Num steps = %d", args.epochs)
        logger.info(" Early Stopping dev_loss = %f", args.dev_loss)

        bar = tqdm(range(len(train_loader) * args.epochs),
                   total=len(train_loader) * args.epochs)
        # cycle() lets us drive the whole run off a single step counter.
        train_loader = cycle(train_loader)

        # Optimizer with standard no-weight-decay groups.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # CONSISTENCY FIX: the decay/no-decay groups were swapped — bias and
        # LayerNorm parameters received args.weight_decay while all other
        # weights got 0.0. Match the convention used by the other training
        # loops in this file (decay everything except bias/LayerNorm).
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=0,
                                         t_total=len(bar))

        steps = 0
        total_train_loss = 0
        for step in bar:
            model.train()
            data_batch = next(train_loader)
            for k, v in data_batch.items():
                data_batch[k] = v.to(device)

            # normal forward/backward pass
            loss = model(batch=data_batch, feed_labels=True)
            if args.n_gpus > 1:
                loss = loss.mean()
            loss.backward()

            # adversarial pass: perturb embeddings, accumulate the adversarial
            # gradient on top of the normal one, then restore the embeddings.
            fgm.attack()
            loss_adv = model(batch=data_batch, feed_labels=True)
            if args.n_gpus > 1:
                loss_adv = loss_adv.mean()
            loss_adv.backward()
            fgm.restore()

            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

            steps += 1
            total_train_loss += loss.item()
            bar.set_description("training loss {}".format(loss.item()))

            if (steps) % args.eval_steps == 0:
                logits, loss, dev_labels = do_inference(
                    model, dev_loader, device)
                test_logits, test_loss, test_labels = do_inference(
                    model, test_loader, device)
                inference_labels = logits.argmax(axis=1)
                test_inference_labels = test_logits.argmax(axis=1)
                f1 = f1_score(dev_labels,
                              inference_labels,
                              labels=[0, 1, 2],
                              average="macro")
                test_f1 = f1_score(test_labels,
                                   test_inference_labels,
                                   labels=[0, 1, 2],
                                   average="macro")
                logger.info("=========eval report =========")
                logger.info("step : %s ", str(steps))
                logger.info("average_train loss: %s" %
                            (str(total_train_loss / steps)))
                logger.info("subset train loss: %s" % (str(test_loss)))
                logger.info("subset train f1 score: %s", str(test_f1))
                logger.info("eval loss: %s", str(loss))
                logger.info("eval f1 score: %s", str(f1))

                output_eval_file = os.path.join(args.out_dir,
                                                "eval_records.txt")
                with open(output_eval_file, "a") as writer:
                    # First eval of the run: prepend the run memo header.
                    if steps == args.eval_steps:
                        writer.write("\n%s\n" % (args.memo))
                    writer.write("=========eval report =========\n")
                    writer.write("step : %s \n" % (str(steps)))
                    writer.write("average_train loss: %s\n" %
                                 (str(total_train_loss / steps)))
                    writer.write("subset train loss: %s\n" % (str(test_loss)))
                    writer.write("subset f1 score: %s\n" % (str(test_f1)))
                    writer.write("eval loss: %s\n" % (str(loss)))
                    writer.write("eval f1 score: %s\n" % (str(f1)))
                    writer.write('\n')

                if f1 > best_f1:
                    logger.info("we get a best dev f1 %s saving model....",
                                str(f1))
                    output_path = os.path.join(args.out_dir,
                                               "pytorch_model.bin")
                    if hasattr(model, 'module'):
                        logger.info("model has module")
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    torch.save(model_to_save.state_dict(), output_path)
                    logger.info("model saved")
                    best_f1 = f1
                    save_config(args)
                    logger.info("args saved")

        # reload the best checkpoint and run final inference
        args.to_resume_model = True
        model = load_model(Model, args, config)
        model = model.to(device)
        if args.n_gpus > 1:
            model = nn.DataParallel(model)
        dev_logits, loss, dev_labels = do_inference(model, dev_loader, device)

        pub_data = Corpus(args, 'test.csv', transform)
        pub_sampler = SequentialSampler(pub_data)
        pub_loader = DataLoader(pub_data,
                                batch_size=args.eval_batch_size,
                                sampler=pub_sampler)
        test_logits, _, _ = do_inference(model, pub_loader, device)
        return dev_logits, dev_labels, test_logits
    else:
        logger.info("the model %s is not registered", args.model_name)
        return
def train(args, train_dataset, model, tokenizer):
    # Train with optional in-batch random mixup (args.mix_option == 1) and,
    # when args.iterative is set, per-epoch augmentation of the training set
    # with adversarial examples generated by OpenAttack against the current
    # model. Returns (global_step, average training loss).
    global extracted_grads
    """ Train the model """
    # NOTE(review): the string above follows a statement, so it is NOT a
    # docstring — it is a no-op expression. Kept as-is.
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()
    if args.mix_option == 1:
        logger.info("Random Mixup")
    else:
        logger.info("No Mixup")
    # Effective per-step batch size scales with the number of visible GPUs.
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    processor = processors[args.task_name]()
    attacker = get_attacker(args.attacker)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.train_batch_size,
                                  shuffle=True)
    # Derive the total number of optimizer steps (t_total) either from
    # max_steps or from epochs * steps-per-epoch.
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
    # Prepare optimizer and schedule (linear warmup and decay);
    # bias/LayerNorm weights get no weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    # Check if saved optimizer or scheduler states exist (resume support)
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )
    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)
    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproductibility
    ## Add Mixup in Batch
    epoch = 0
    for _ in train_iterator:
        epoch += 1
        if epoch > 1 and args.iterative:
            ## augment the current train dataset with new batch of adversarial exampels generated by the currect model
            orig_data = load_custom_dataset(os.path.join(
                args.data_dir, "train.tsv"),
                                            all_data=True,
                                            number=args.num_adv)
            clsf = ModelClassifier(tokenizer, model, args)
            attack_eval = OpenAttack.attack_evals.DefaultAttackEval(
                attacker, clsf, progress_bar=True)
            adv_egs = attack_eval.eval(orig_data,
                                       visualize=False,
                                       return_examples=True)
            adv_examples = processor._create_examples(adv_egs, "adv_train")
            logger.info(
                "Epoch: {}, Number of adversarial examples added to training: {}"
                .format(epoch, len(adv_examples)))
            adv_dataset = convert_examples_dataset(args, adv_examples,
                                                   tokenizer)
            # train_dataset grows each epoch; the dataloader is rebuilt so the
            # new adversarial examples are shuffled in.
            train_dataset = ConcatDataset([train_dataset, adv_dataset])
            ## start training on augmented data (we will shuffle the training data)
            # train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
            train_dataloader = DataLoader(train_dataset,
                                          batch_size=args.train_batch_size,
                                          shuffle=True)
            logger.info("Current Num examples = %d", len(train_dataset))
        epoch_iterator = train_dataloader
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            ## normal training
            ## for now, just ignore token type ids
            input_ids = batch[0]  #(bsz, len)
            attention_mask = batch[1]
            batch_size = input_ids.size(0)
            length = input_ids.size(1)
            labels = batch[3]  #(bsz,)
            logits, outputs = model(input_ids,
                                    attention_mask)  #(bsz, num_labels)
            # x_embeddings = outputs[2]  # (bsz, len, dim)
            # x_embeddings.register_hook(save_grad("x_emb"))
            # logger.info("#outputs 1: " + str(len(outputs[-1])))
            # Standard supervised loss on the un-mixed batch.
            L_ori = nn.CrossEntropyLoss()(logits.view(-1, args.num_labels),
                                          labels.view(-1))
            ## RandomMix
            if args.mix_option == 1:
                # Pair each example with a randomly permuted partner and mix
                # hidden states at a randomly chosen layer.
                idx = torch.randperm(batch_size)
                input_ids_2 = input_ids[idx]
                labels_2 = labels[idx]
                attention_mask_2 = attention_mask[idx]
                ## convert the labels to one-hot
                # NOTE(review): `labels` is rebound to its one-hot form here;
                # anything after this point sees one-hot labels.
                labels = torch.zeros(batch_size,
                                     args.num_labels).to(args.device).scatter_(
                                         1, labels.view(-1, 1), 1)
                labels_2 = torch.zeros(batch_size, args.num_labels).to(
                    args.device).scatter_(1, labels_2.view(-1, 1), 1)
                l = np.random.beta(args.alpha, args.alpha)
                # l = max(l, 1-l) ## not needed when only using labeled examples
                mixed_labels = l * labels + (1 - l) * labels_2
                mix_layer = np.random.choice(args.mix_layers_set, 1)[0]
                mix_layer = mix_layer - 1
                logits, outputs = model(input_ids, attention_mask,
                                        input_ids_2, attention_mask_2, l,
                                        mix_layer)
                probs = torch.softmax(logits, dim=1)  #(bsz, num_labels)
                # KL divergence between mixed prediction and mixed label.
                L_mix = F.kl_div(probs.log(), mixed_labels, None, None,
                                 'batchmean')
                loss = L_ori + L_mix
            else:
                loss = L_ori
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            tr_loss += loss.item()
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value
                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    # NOTE(review): scheduler.get_lr() is deprecated in newer
                    # torch; other code in this file uses get_last_lr().
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss
                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    # print(json.dumps({**logs, **{"step": global_step}}))
                    logging.info("Global Step: " + str(global_step))
                    logging.info("Loss: " + str(loss_scalar))
            if args.max_steps > 0 and global_step > args.max_steps:
                train_iterator.close()
                break
    ## save the final epoch only
    if args.local_rank in [-1, 0]:
        # Save model checkpoint
        output_dir = os.path.join(args.output_dir, "final-checkpoint")
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model_to_save = (model.module if hasattr(model, "module") else model
                         )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, "training_args.bin"))
        logger.info("Saving model checkpoint to %s", output_dir)
        torch.save(optimizer.state_dict(),
                   os.path.join(output_dir, "optimizer.pt"))
        torch.save(scheduler.state_dict(),
                   os.path.join(output_dir, "scheduler.pt"))
        logger.info("Saving optimizer and scheduler states to %s", output_dir)
    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
# Script-level setup: validation loader, model/criterion on device,
# optimizer + linear-warmup scheduler, then truncate the run's log file.
VALID_DATA_LOADER = create_dataloader(df=VALID, max_len=MAX_LEN, bs=BS)
"""Calling Model and sending to CUDA"""
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
model = Classifier()
# NOTE(review): self-assignment is a no-op — `criterion` is presumably
# defined earlier in the file; confirm and drop this line.
criterion = criterion
criterion.to(device)
model.to(device)
"""Otimizador e Scheduler"""  # (Portuguese: "Optimizer and Scheduler")
optimizer = AdamW(
    model.parameters(),
    lr=float(config['model']['learning_rate']),
    correct_bias=False
)
# NOTE(review): num_training_steps = num_epochs * batch_size looks wrong —
# the usual formula is num_epochs * len(train_dataloader). Verify intent.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config['model']['num_warmup_steps'],
    num_training_steps=config['model']['num_epochs'] * config['model']['batch_size']
)
"""Training Loop"""
EPOCHS = config['model']['num_epochs']
# Create/truncate the log file before training starts.
with open("logger.txt", "w") as f:
    f.write(f"")
def train(args, train_dataset, model, tokenizer):
    """Train the model on ``train_dataset``.

    Runs the standard fine-tuning loop with gradient accumulation,
    optional apex fp16 and multi-GPU DataParallel, periodic console
    logging, and periodic checkpointing (plus best-model tracking when
    ``args.evaluate_during_training`` is set).

    :param args: namespace of training hyper-parameters and paths
    :param train_dataset: torch Dataset of training examples
    :param model: transformers model to fine-tune
    :param tokenizer: tokenizer, saved alongside each checkpoint
    :return: (global_step, average training loss per optimizer step)
    """
    # Effective batch size scales with the number of DataParallel GPUs.
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        # max_steps is not normally used, so the total number of optimizer
        # steps to run is t_total.
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay); bias and
    # LayerNorm weights are conventionally excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=range(args.n_gpu))

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps,
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    best_acc = 0.0
    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            global_step = int(
                args.model_name_or_path.split("-")[-1].split("/")[0])
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)
            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            # FIX: the path may be a plain model name without a "-<step>"
            # suffix; start fresh instead of crashing on int().
            pass

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=False)
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=False)
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet"] else None
                )  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    loss_scalar = (tr_loss -
                                   logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss
                    print('\n')
                    print(json.dumps({**logs, **{"step": global_step}}))
                    print('\n')
                # Save a checkpoint whenever global_step is a multiple of
                # save_steps.
                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(
                        args.output_dir,
                        'checkpoint_{}'.format(args.task_name))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # FIX: ``recent_acc`` was previously read even when
                    # args.evaluate_during_training was False, raising a
                    # NameError on the first save step. Best-model
                    # bookkeeping now only runs when we actually evaluate.
                    if args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics
                        # may not average well.
                        recent_acc = evaluate(args, model, tokenizer)
                        best_dir = os.path.join(
                            args.output_dir,
                            'best_{}'.format(args.task_name))
                        if not os.path.exists(best_dir):
                            os.makedirs(best_dir)
                        is_best = recent_acc > best_acc
                        best_acc = max(recent_acc, best_acc)
                        logger.info(
                            'Recent EVAL ACC: {} BEST EVAL ACC: {}'.format(
                                recent_acc, best_acc))
                        if is_best:
                            model_to_save = (
                                model.module
                                if hasattr(model, "module") else model
                            )  # Take care of distributed/parallel training
                            model_to_save.save_pretrained(best_dir)
                            tokenizer.save_pretrained(best_dir)
                            torch.save(
                                args,
                                os.path.join(best_dir, "training_args.bin"))
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
    return global_step, tr_loss / global_step
def train_lm(
        data_dir: str,
        model_dir: str,
        dataset: str,
        baseline: str,
        hyper_params: Dict[str, Any],
        loss_type: str,
        compute_train_batch_size: int,
        predict_batch_size: int,
        gpu_ids: Optional[List[int]],
        logger: Optional[logging.Logger] = None
) -> None:
    """Fine-tune a pre-trained LM baseline on a scruples dataset.

    Fine-tune ``baseline`` on ``dataset``, writing all results and
    artifacts to ``model_dir``. Return the best calibrated xentropy
    achieved on dev after any epoch.

    Parameters
    ----------
    data_dir : str
        The path to the directory containing the dataset.
    model_dir : str
        The path to the directory in which to save results.
    dataset : str
        The dataset to use when fine-tuning ``baseline``. Must be
        either "resource" or "corpus".
    baseline : str
        The pre-trained LM to fine-tune. Should be one of the keys for
        ``scruples.baselines.$dataset.FINE_TUNE_LM_BASELINES`` where
        ``$dataset`` corresponds to the ``dataset`` argument to this
        function.
    hyper_params : Dict[str, Any]
        The dictionary of hyper-parameters for the model.
    loss_type : str
        The type of loss to use. Should be one of ``"xentropy-hard"``,
        ``"xentropy-soft"``, ``"xentropy-full"`` or
        ``"dirichlet-multinomial"``.
    compute_train_batch_size : int
        The largest batch size that will fit on the hardware during
        training. Gradient accumulation will be used to make sure the
        actual size of the batch on the hardware respects this limit.
    predict_batch_size : int
        The number of instances to use in a predicting batch.
    gpu_ids : Optional[List[int]]
        A list of IDs for GPUs to use.
    logger : Optional[logging.Logger], optional (default=None)
        The logger to use when logging messages. If ``None``, then no
        messages will be logged.

    Returns
    -------
    float
        The best calibrated xentropy on dev achieved after any epoch.
    bool
        ``True`` if the training loss diverged, ``False`` otherwise.
    """
    # NOTE(review): the return annotation says ``None`` but a
    # ``(float, bool)`` tuple is returned, as the docstring describes.
    gc.collect()
    # collect any garbage to make sure old torch objects are cleaned up (and
    # their memory is freed from the GPU). Otherwise, old tensors can hang
    # around on the GPU, causing CUDA out-of-memory errors.

    if loss_type not in settings.LOSS_TYPES:
        raise ValueError(
            f'Unrecognized loss type: {loss_type}. Please use one of'
            f' "xentropy-hard", "xentropy-soft", "xentropy-full" or'
            f' "dirichlet-multinomial".')

    # FIX: ``gpu_ids`` is Optional, but ``len(gpu_ids)`` was previously
    # called unconditionally — a TypeError for ``None`` and a
    # ZeroDivisionError for ``[]``. Compute the device count once and use
    # it everywhere a count is needed; the GPU path is unchanged.
    n_gpus = len(gpu_ids) if gpu_ids else 0

    # Step 1: Manage and construct paths.

    if logger is not None:
        logger.info('Creating the model directory.')

    checkpoints_dir = os.path.join(model_dir, 'checkpoints')
    tensorboard_dir = os.path.join(model_dir, 'tensorboard')
    os.makedirs(model_dir)
    os.makedirs(checkpoints_dir)
    os.makedirs(tensorboard_dir)

    config_file_path = os.path.join(model_dir, 'config.json')
    log_file_path = os.path.join(model_dir, 'log.txt')
    best_checkpoint_path = os.path.join(
        checkpoints_dir, 'best.checkpoint.pkl')
    last_checkpoint_path = os.path.join(
        checkpoints_dir, 'last.checkpoint.pkl')

    # Step 2: Setup the log file.

    if logger is not None:
        logger.info('Configuring log files.')

    log_file_handler = logging.FileHandler(log_file_path)
    log_file_handler.setLevel(logging.DEBUG)
    log_file_handler.setFormatter(logging.Formatter(settings.LOG_FORMAT))
    logging.root.addHandler(log_file_handler)

    # Step 3: Record the script's arguments.

    if logger is not None:
        logger.info(f'Writing arguments to {config_file_path}.')

    with open(config_file_path, 'w') as config_file:
        json.dump({
            'data_dir': data_dir,
            'model_dir': model_dir,
            'dataset': dataset,
            'baseline': baseline,
            'hyper_params': hyper_params,
            'loss_type': loss_type,
            'compute_train_batch_size': compute_train_batch_size,
            'predict_batch_size': predict_batch_size,
            'gpu_ids': gpu_ids
        }, config_file)

    # Step 4: Configure GPUs.

    if gpu_ids:
        if logger is not None:
            logger.info(
                f'Configuring environment to use {len(gpu_ids)} GPUs:'
                f' {", ".join(str(gpu_id) for gpu_id in gpu_ids)}.')

        os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(map(str, gpu_ids))

        if not torch.cuda.is_available():
            raise EnvironmentError('CUDA must be available to use GPUs.')

        device = torch.device('cuda')
    else:
        if logger is not None:
            logger.info('Configuring environment to use CPU.')

        device = torch.device('cpu')

    # Step 5: Fetch the baseline information and training loop parameters.

    if logger is not None:
        logger.info('Retrieving baseline and related parameters.')

    if dataset == 'resource':
        Model, baseline_config, _, make_transform =\
            resource.FINE_TUNE_LM_BASELINES[baseline]
    elif dataset == 'corpus':
        Model, baseline_config, _, make_transform =\
            corpus.FINE_TUNE_LM_BASELINES[baseline]
    else:
        raise ValueError(
            f'dataset must be either "resource" or "corpus", not'
            f' {dataset}.')

    n_epochs = hyper_params['n_epochs']
    train_batch_size = hyper_params['train_batch_size']
    # FIX: was ``compute_train_batch_size * len(gpu_ids)`` which divides by
    # zero on the CPU path; clamp the device count to at least 1.
    n_gradient_accumulation = math.ceil(
        train_batch_size / (compute_train_batch_size * max(1, n_gpus)))

    # Step 6: Load the dataset.

    if logger is not None:
        logger.info(f'Loading the dataset from {data_dir}.')

    featurize = make_transform(**baseline_config['transform'])
    if dataset == 'resource':
        Dataset = ScruplesResourceDataset
        labelize = None
        labelize_scores = lambda scores: np.array(scores).astype(float)
    elif dataset == 'corpus':
        Dataset = ScruplesCorpusDataset
        labelize = lambda s: getattr(Label, s).index
        labelize_scores = lambda scores: np.array([
            score for _, score in sorted(
                scores.items(), key=lambda t: labelize(t[0]))
        ]).astype(float)
    else:
        raise ValueError(
            f'dataset must be either "resource" or "corpus", not'
            f' {dataset}.')

    train = Dataset(
        data_dir=data_dir,
        split='train',
        transform=featurize,
        label_transform=labelize,
        label_scores_transform=labelize_scores)
    dev = Dataset(
        data_dir=data_dir,
        split='dev',
        transform=featurize,
        label_transform=labelize,
        label_scores_transform=labelize_scores)

    # FIX: num_workers used ``len(gpu_ids)`` unconditionally; use the safe
    # device count (0 workers on CPU, as before with one GPU per worker).
    train_loader = DataLoader(
        dataset=train,
        batch_size=train_batch_size // n_gradient_accumulation,
        shuffle=True,
        num_workers=n_gpus,
        pin_memory=bool(gpu_ids))
    dev_loader = DataLoader(
        dataset=dev,
        batch_size=predict_batch_size,
        shuffle=False,
        num_workers=n_gpus,
        pin_memory=bool(gpu_ids))

    # Step 7: Create the model, optimizer, and loss.

    if logger is not None:
        logger.info('Initializing the model.')

    model = Model(**baseline_config['model'])
    model.to(device)

    n_optimization_steps = n_epochs * math.ceil(len(train) / train_batch_size)
    # bias and LayerNorm parameters are excluded from weight decay.
    parameter_groups = [
        {
            'params': [
                param
                for name, param in model.named_parameters()
                if 'bias' in name
                or 'LayerNorm.bias' in name
                or 'LayerNorm.weight' in name
            ],
            'weight_decay': 0
        },
        {
            'params': [
                param
                for name, param in model.named_parameters()
                if 'bias' not in name
                and 'LayerNorm.bias' not in name
                and 'LayerNorm.weight' not in name
            ],
            'weight_decay': hyper_params['weight_decay']
        }
    ]
    optimizer = AdamW(parameter_groups, lr=hyper_params['lr'])

    if loss_type == 'xentropy-hard':
        loss = torch.nn.CrossEntropyLoss()
    elif loss_type == 'xentropy-soft':
        loss = SoftCrossEntropyLoss()
    elif loss_type == 'xentropy-full':
        loss = SoftCrossEntropyLoss()
    elif loss_type == 'dirichlet-multinomial':
        loss = DirichletMultinomialLoss()

    # xentropy against the soft labels is always tracked as a common metric.
    xentropy = SoftCrossEntropyLoss()

    scheduler = WarmupLinearSchedule(
        optimizer=optimizer,
        warmup_steps=int(
            hyper_params['warmup_proportion']
            * n_optimization_steps
        ),
        t_total=n_optimization_steps)

    # add data parallelism support
    model = torch.nn.DataParallel(model)

    # Step 8: Run training.

    n_train_batches_per_epoch = math.ceil(len(train) / train_batch_size)
    n_dev_batch_per_epoch = math.ceil(len(dev) / predict_batch_size)

    writer = tensorboardX.SummaryWriter(log_dir=tensorboard_dir)

    best_dev_calibrated_xentropy = math.inf
    for epoch in range(n_epochs):
        # set the model to training mode
        model.train()

        # run training for the epoch
        epoch_train_loss = 0
        epoch_train_xentropy = 0
        for i, (_, features, labels, label_scores) in tqdm.tqdm(
                enumerate(train_loader),
                total=n_gradient_accumulation * n_train_batches_per_epoch,
                **settings.TQDM_KWARGS
        ):
            # move the data onto the device
            features = {k: v.to(device) for k, v in features.items()}

            # create the targets
            if loss_type == 'xentropy-hard':
                targets = labels
            elif loss_type == 'xentropy-soft':
                targets = label_scores / torch.unsqueeze(
                    torch.sum(label_scores, dim=-1), dim=-1)
            elif loss_type == 'xentropy-full':
                targets = label_scores
            elif loss_type == 'dirichlet-multinomial':
                targets = label_scores
            # create the soft labels
            soft_labels = label_scores / torch.unsqueeze(
                torch.sum(label_scores, dim=-1), dim=-1)

            # move the targets and soft labels to the device
            targets = targets.to(device)
            soft_labels = soft_labels.to(device)

            # make predictions
            logits = model(**features)[0]

            batch_loss = loss(logits, targets)
            batch_xentropy = xentropy(logits, soft_labels)

            # update training statistics (running means over the epoch)
            epoch_train_loss = (
                batch_loss.item() + i * epoch_train_loss
            ) / (i + 1)
            epoch_train_xentropy = (
                batch_xentropy.item() + i * epoch_train_xentropy
            ) / (i + 1)

            # update the network
            batch_loss.backward()

            if (i + 1) % n_gradient_accumulation == 0:
                optimizer.step()
                optimizer.zero_grad()

                scheduler.step()

            # write training statistics to tensorboard

            step = n_train_batches_per_epoch * epoch + (
                (i + 1) // n_gradient_accumulation)
            if step % 100 == 0 and (i + 1) % n_gradient_accumulation == 0:
                writer.add_scalar('train/loss', epoch_train_loss, step)
                writer.add_scalar('train/xentropy', epoch_train_xentropy, step)

        # run evaluation
        with torch.no_grad():
            # set the model to evaluation mode
            model.eval()

            # run validation for the epoch
            epoch_dev_loss = 0
            epoch_dev_soft_labels = []
            epoch_dev_logits = []
            for i, (_, features, labels, label_scores) in tqdm.tqdm(
                    enumerate(dev_loader),
                    total=n_dev_batch_per_epoch,
                    **settings.TQDM_KWARGS):
                # move the data onto the device
                features = {k: v.to(device) for k, v in features.items()}

                # create the targets
                if loss_type == 'xentropy-hard':
                    targets = labels
                elif loss_type == 'xentropy-soft':
                    targets = label_scores / torch.unsqueeze(
                        torch.sum(label_scores, dim=-1), dim=-1)
                elif loss_type == 'xentropy-full':
                    targets = label_scores
                elif loss_type == 'dirichlet-multinomial':
                    targets = label_scores

                # move the targets to the device
                targets = targets.to(device)

                # make predictions
                logits = model(**features)[0]

                batch_loss = loss(logits, targets)

                # update validation statistics
                epoch_dev_loss = (
                    batch_loss.item() + i * epoch_dev_loss
                ) / (i + 1)
                epoch_dev_soft_labels.extend(
                    (
                        label_scores
                        / torch.unsqueeze(torch.sum(label_scores, dim=-1),
                                          dim=-1)
                    ).cpu().numpy().tolist()
                )
                epoch_dev_logits.extend(logits.cpu().numpy().tolist())

            # compute validation statistics
            epoch_dev_soft_labels = np.array(epoch_dev_soft_labels)
            epoch_dev_logits = np.array(epoch_dev_logits)

            calibration_factor = utils.calibration_factor(
                logits=epoch_dev_logits,
                targets=epoch_dev_soft_labels)

            epoch_dev_xentropy = utils.xentropy(
                y_true=epoch_dev_soft_labels,
                y_pred=softmax(epoch_dev_logits, axis=-1))
            epoch_dev_calibrated_xentropy = utils.xentropy(
                y_true=epoch_dev_soft_labels,
                y_pred=softmax(epoch_dev_logits / calibration_factor,
                               axis=-1))

            # write validation statistics to tensorboard
            writer.add_scalar('dev/loss', epoch_dev_loss, step)
            writer.add_scalar('dev/xentropy', epoch_dev_xentropy, step)
            writer.add_scalar(
                'dev/calibrated-xentropy',
                epoch_dev_calibrated_xentropy,
                step)

            if logger is not None:
                logger.info(
                    f'\n\n'
                    f'  epoch {epoch}:\n'
                    f'    train loss              : {epoch_train_loss:.4f}\n'
                    f'    train xentropy          : {epoch_train_xentropy:.4f}\n'
                    f'    dev loss                : {epoch_dev_loss:.4f}\n'
                    f'    dev xentropy            : {epoch_dev_xentropy:.4f}\n'
                    f'    dev calibrated xentropy : {epoch_dev_calibrated_xentropy:.4f}\n'
                    f'    calibration factor      : {calibration_factor:.4f}\n')

        # update checkpoints
        torch.save(
            {
                'epoch': epoch,
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'calibration_factor': calibration_factor
            },
            last_checkpoint_path)

        # update the current best model
        if epoch_dev_calibrated_xentropy < best_dev_calibrated_xentropy:
            shutil.copyfile(last_checkpoint_path, best_checkpoint_path)
            best_dev_calibrated_xentropy = epoch_dev_calibrated_xentropy

        # exit early if the training loss has diverged
        if math.isnan(epoch_train_loss):
            # FIX: this call was unguarded although ``logger`` may be None
            # (every other call site checks first).
            if logger is not None:
                logger.info('Training loss has diverged. Exiting early.')

            return best_dev_calibrated_xentropy, True

    # FIX: guard the optional logger here as well.
    if logger is not None:
        logger.info(
            f'Training complete. Best dev calibrated xentropy was'
            f' {best_dev_calibrated_xentropy:.4f}.')

    return best_dev_calibrated_xentropy, False
def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
    """Train the (token-classification) model.

    Freezes the BERT embeddings and encoder, then runs the standard
    fine-tuning loop with gradient accumulation, optional apex fp16,
    DataParallel/DistributedDataParallel, TensorBoard logging, and
    periodic checkpointing.

    :param args: namespace of training hyper-parameters and paths
    :param train_dataset: torch Dataset of training examples
    :param model: transformers model with a ``.bert`` submodule
    :param tokenizer: tokenizer, saved alongside each checkpoint
    :param labels: label list, forwarded to ``evaluate``
    :param pad_token_label_id: padding label id, forwarded to ``evaluate``
    :return: (global_step, average training loss per optimizer step)
    """
    # freeze the bert layers to preserve pre-trained embeddings:
    # model.module.bert.weight.requires_grad_(False)
    # model.module.bert.bias.requires_grad_(False)
    # FIX: the second prefix was 'encoding', which matches no BERT
    # submodule (they are named 'embeddings', 'encoder' and 'pooler'),
    # so the encoder was never actually frozen as intended.
    for name, param in model.bert.named_parameters():
        if name.startswith('embeddings') or name.startswith('encoder'):
            param.requires_grad = False

    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = int(
                args.model_name_or_path.split("-")[-1].split("/")[0])
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)
            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            # FIX: was a bare ``except: pass`` — only a non-numeric
            # checkpoint suffix (a plain model name) should be ignored.
            pass

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet"] else None
                )  # XLM and RoBERTa don"t use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in pytorch-transformers (see doc)
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                # FIX: optimizer.step() must run before scheduler.step();
                # the original order skips the first LR value and triggers
                # a PyTorch warning (semantics changed in PyTorch 1.1).
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results, _ = evaluate(args,
                                              model,
                                              tokenizer,
                                              labels,
                                              pad_token_label_id,
                                              mode="dev")
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
    if args.local_rank in [-1, 0]:
        tb_writer.close()
    return global_step, tr_loss / global_step
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer, logger) -> Tuple[int, float]:
    """Train the language model.

    Runs (masked) LM fine-tuning with gradient accumulation, optional
    apex fp16, DataParallel/DistributedDataParallel, TensorBoard
    logging, and rotating checkpoints. Supports resuming from a
    ``checkpoint-<step>`` directory.

    Returns (global_step, average training loss per optimizer step).
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    # Pad each batch to its longest example; falls back to zero padding
    # when the tokenizer has no pad token.
    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    # Grow/shrink the embedding matrix to match any added special tokens.
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay); bias and
    # LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from
            # model path (expects a "checkpoint-<step>" directory name)
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            # MLM masks a fraction of tokens; causal LM predicts the
            # input shifted by one, so labels == inputs.
            inputs, labels = mask_tokens(batch, tokenizer,
                                         args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, labels=labels)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    # Delete older checkpoints beyond the configured limit.
                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    # NOTE(review): raises ZeroDivisionError if no optimizer step ever ran
    # (global_step == 0) — confirm callers never invoke this with an empty
    # dataloader.
    return global_step, tr_loss / global_step
# Wrap the pre-tokenized encodings in torch Datasets.
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model.to(device)
model.train()
print('initialized bert model')

train_loader = DataLoader(train_dataset, batch_size=5, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)

print('starting training\n')
for epoch in range(3):
    print('EPOCH', epoch + 1)
    # FIX: batch_num was initialized to 1 but never incremented, so every
    # iteration printed "batch 1"; enumerate provides the running count.
    for batch_num, batch in enumerate(train_loader, start=1):
        print('   batch', batch_num)
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # First element of the output tuple is the classification loss.
        loss = outputs[0]
        loss.backward()
        optim.step()
def train_model(self, batches):
    """Train a fresh model on pre-built batches.

    Splits ``batches`` 90/10 into train/validation, builds per-task loss
    functions, and runs ``self.params.num_epochs`` epochs of training.
    After each epoch the running loss is printed and, when a validation
    split exists, validation metrics are reported.

    Fix: the original loop never called ``self.optimizer.zero_grad()``,
    so gradients from every batch accumulated for the whole run; the
    optimizer is now zeroed before each backward pass. Dead commented-out
    weighting code was also removed.

    :param batches: list of batch dicts (as produced by ``get_batches``)
        with "inputs", "attentions", "labels" and "masks" entries.
    """
    self.model = self.new_model()
    self.model = self.model.to(self.device)
    self.optimizer = AdamW(self.model.parameters(),
                           lr=self.params.learning_rate)
    train_batches, val_batches = train_test_split(
        batches,
        shuffle=True,
        random_state=self.params.random_state,
        test_size=.1)
    self.loss = self.create_loss_functions()
    for epoch in range(self.params.num_epochs):
        loss_val = 0
        self.model.train()
        for batch in train_batches:
            X_ids = torch.tensor(batch["inputs"]).to(self.device)
            X_att = torch.tensor(batch["attentions"]).to(self.device)
            # Skip batches where no task has any labeled instance.
            if len([
                    x for task_label in self.task_labels
                    for x in batch["masks"][task_label]
            ]) == 0:
                continue
            logits, _ = self.model(X_ids, attn=X_att)
            class_loss = dict()
            for task_label in self.task_labels:
                # Only the instances listed in the task's mask carry labels.
                masked_logits = logits[task_label][batch["masks"][task_label]]
                masked_labels = [
                    batch["labels"][task_label][x]
                    for x in batch["masks"][task_label]
                ]
                if self.multi_task or self.ensemble:
                    # CrossEntropyLoss requires integer (Long) class targets.
                    masked_labels = torch.tensor(masked_labels).type(
                        "torch.LongTensor").to(self.device)
                else:
                    masked_labels = torch.tensor(masked_labels).to(self.device)
                if len(batch["masks"][task_label]) > 0:
                    class_loss[task_label] = self.loss[task_label](
                        masked_logits, masked_labels)
            total_loss = sum(class_loss.values())
            loss_val += total_loss.item()
            # Zero stale gradients before computing this batch's — was missing.
            self.optimizer.zero_grad()
            total_loss.backward()
            self.optimizer.step()
        print("Epoch", epoch, "-", "Loss", round(loss_val, 3))
        if val_batches:
            val_results = self.predict(val_batches, self.model)
            print("Validation:")
            print(self.report_results(val_results))
def train(self, train_dataloader, valid_dataloader=None, test_dataloader=None, should_continue=False):
    """Run the full training loop for ``self.model``.

    Supports gradient accumulation, apex fp16, multi-GPU via DataParallel,
    optional head-only training, periodic evaluation/logging, periodic
    checkpointing, and resuming optimizer/scheduler state when
    ``should_continue`` is True.

    :param train_dataloader: dataloader of training batches.
    :param valid_dataloader: optional validation dataloader, evaluated every
        ``config.logging_steps`` optimization steps.
    :param test_dataloader: optional test dataloader, or a dict mapping an
        eval name to a dataloader (or to a ``(dataloader, kwargs)`` tuple).
    :param should_continue: resume from saved optimizer/scheduler state.
    :return: ``(global_step, average training loss per step)``.
    """
    tb_writer = SummaryWriter()
    train_epochs = self.config.train_epochs
    # When max_steps caps training, derive the epoch count from it instead.
    if self.config.max_steps > 0:
        train_steps = self.config.max_steps
        train_epochs = self.config.max_steps // (
            len(train_dataloader) // self.config.grad_acc_steps) + 1
    else:
        train_steps = len(
            train_dataloader) // self.config.grad_acc_steps * train_epochs
    # On resume, the warmup schedule is sized from the original total sample
    # count rather than this (possibly different) dataloader's length.
    if self.total_samples and should_continue:
        steps_total = self.total_samples // self.config.train_bs // self.config.grad_acc_steps * train_epochs
    else:
        steps_total = train_steps
    # Prepare optimizer and schedule (linear warmup and decay).
    # Bias and LayerNorm weights are excluded from weight decay, per the
    # standard BERT fine-tuning recipe.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": self.config.weight_decay,
        },
        {
            "params": [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    self.optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=self.config.lr,
        eps=self.config.adam_eps,
    )
    self.scheduler = get_linear_schedule_with_warmup(
        self.optimizer,
        num_warmup_steps=self.config.warmup_steps,
        num_training_steps=steps_total)
    # self.scheduler = get_constant_schedule(self.optimizer)
    if should_continue and self.global_step > 0:
        logger.info("loading saved optimizer and scheduler states")
        # NOTE(review): `assert` is stripped under `python -O`; an explicit
        # check-and-raise would be safer for validating resume state.
        assert (self.optimizer_state_dict)
        assert (self.scheduler_state_dict)
        self.optimizer.load_state_dict(self.optimizer_state_dict)
        self.scheduler.load_state_dict(self.scheduler_state_dict)
    else:
        logger.info("Using fresh optimizer and scheduler")
    if self.config.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        self.model, self.optimizer = amp.initialize(
            self.model, self.optimizer, opt_level=self.config.fp16_opt_level)
    # multi-gpu training (should be after apex fp16 initialization)
    if self.config.n_gpu > 1 and not isinstance(self.model,
                                                torch.nn.DataParallel):
        self.model = torch.nn.DataParallel(self.model)
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d (%d)", len(train_dataloader.dataset),
                len(train_dataloader))
    logger.info(" Num Epochs = %d", train_epochs)
    logger.info(" Batch size = %d", self.config.train_bs)
    logger.info(" Learning rate = %e", self.config.lr)
    logger.info(" Loss label weights = %s", self.config.loss_label_weights)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        self.config.train_bs * self.config.grad_acc_steps)
    logger.info(" Gradient Accumulation steps = %d",
                self.config.grad_acc_steps)
    logger.info(" Total optimization steps = %d", train_steps)
    if not should_continue:
        self.global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # (commented-out checkpoint-resume bookkeeping retained in VCS history:
    #  it derived epochs_trained / steps_trained_in_current_epoch from the
    #  step suffix of self.config.model_path)
    train_loss = 0.0
    self.model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(train_epochs),
        desc="Epoch",
    )
    util.set_seed(self.config)  # Added here for reproducibility
    self.model.train()
    if self.config.train_head_only:
        # NOTE(review): only the embedding parameters are frozen here, not the
        # full encoder — "head only" appears broader than what is frozen;
        # confirm intent (a broader freeze is commented out below).
        for param in self.model.roberta.embeddings.parameters():
            param.requires_grad = False
        logger.info("Training only head")
        # for param in self.model.__getattr__(self.config.model_type).roberta.parameters():
        #     param.requires_grad = False
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            self.model.train()
            inputs = self.__inputs_from_batch(batch)
            outputs = self.model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]
            if self.config.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if self.config.grad_acc_steps > 1:
                # Scale so accumulated gradients average over the micro-batches.
                loss = loss / self.config.grad_acc_steps
            if self.config.fp16:
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            batch_loss = loss.item()
            train_loss += batch_loss
            # Only step the optimizer every grad_acc_steps micro-batches.
            if (step + 1) % self.config.grad_acc_steps == 0:
                if self.config.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(self.optimizer),
                        self.config.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(
                        self.model.parameters(), self.config.max_grad_norm)
                self.optimizer.step()
                self.scheduler.step()  # Update learning rate schedule
                self.model.zero_grad()
                self.global_step += 1
                if self.config.logging_steps > 0 and self.global_step % self.config.logging_steps == 0:
                    logs = {}
                    if valid_dataloader:
                        # Backtranslation eval only when no test set is given.
                        result_valid, * \
                            _ = self.evaluate(
                                'valid', valid_dataloader, backtrans=(test_dataloader == None))
                        logs.update({
                            f"valid_{k}": v
                            for k, v in result_valid.items()
                        })
                    if test_dataloader:
                        # Normalize a bare dataloader to a named dict; entries
                        # may be (dataloader, kwargs) tuples.
                        test_dataloader = test_dataloader if isinstance(
                            test_dataloader, dict) else {
                                'test': test_dataloader
                            }
                        for eval_name, dataloader_or_tuple in test_dataloader.items(
                        ):
                            if isinstance(dataloader_or_tuple, tuple):
                                dataloader, kwargs = dataloader_or_tuple
                            else:
                                dataloader = dataloader_or_tuple
                                kwargs = {}
                            result_test, * \
                                _ = self.evaluate(
                                    eval_name, dataloader, **kwargs)
                            logs.update({
                                f"{eval_name}_{k}": v
                                for k, v in result_test.items()
                            })
                    learning_rate_scalar = self.scheduler.get_last_lr()[0]
                    logger.info("Learning rate: %f (at step %d)",
                                learning_rate_scalar, step)
                    logs["learning_rate"] = learning_rate_scalar
                    logs["train_loss"] = train_loss
                    self.after_logging(logs)
                    logger.info("Batch loss: %f", batch_loss)
                    # for key, value in logs.items():
                    #     tb_writer.add_scalar(key, value, self.global_step)
                if self.config.save_steps > 0 and self.global_step % self.config.save_steps == 0:
                    # Save model checkpoint
                    self.save_checkpoint()
            if self.config.max_steps > 0 and self.global_step > self.config.max_steps:
                epoch_iterator.close()
                break
        if self.config.max_steps > 0 and self.global_step > self.config.max_steps:
            train_iterator.close()
            break
    if self.config.train_head_only:
        logger.info("Training only head")
        # for param in self.model.__getattr__(self.config.model_type).parameters():
        #     param.requires_grad = True
        for param in self.model.roberta.embeddings.parameters():
            param.requires_grad = False
    tb_writer.close()
    # Keep optimizer/scheduler state so a later call with
    # should_continue=True can resume.
    self.optimizer_state_dict = self.optimizer.state_dict()
    self.scheduler_state_dict = self.scheduler.state_dict()
    avg_train_loss = train_loss / self.global_step
    logger.info("Learning rate now: %s", self.scheduler.get_last_lr())
    logger.info("***** Done training *****")
    return self.global_step, avg_train_loss
class ToxicityClassifier():
    """BERT-based toxicity classifier over multi-annotator data.

    Supports several task modes (multi_label, multi_task, ensemble, single,
    log_reg), selected by ``params.task``; runs k-fold cross-validation with
    per-annotator heads or a single majority-vote head.
    """

    # NOTE(review): mutable default argument (task_labels=["toxic"]) — shared
    # across calls; safer would be task_labels=None with an in-body default.
    def __init__(self, data, annotators, params, task_labels=["toxic"]):
        """Set up tokenizer, task mode, labels, and derived columns.

        :param data: DataFrame with a "text" column and one column per annotator.
        :param annotators: list of annotator column names.
        :param params: configuration namespace (task, batch_size, ...).
        :param task_labels: label names used in single/log_reg modes.
        """
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.device = torch.device('cuda')
        self.data = data
        self.annotators = annotators
        # Exactly one mode flag is switched on via params.task below.
        self.multi_label, self.multi_task, self.ensemble, self.single, self.log_reg = False, False, False, False, False
        setattr(self, params.task, True)
        if self.single or self.log_reg:
            self.task_labels = task_labels
        else:
            # One prediction head per annotator.
            self.task_labels = annotators
        self.majority_vote()
        self.uncertainty()
        print("Data shape after majority voting", self.data.shape)
        # Setting the parameters
        self.params = params
        print([(k, v) for k, v in self.params.__dict__.items()])

    def majority_vote(self):
        """Add a binary "toxic" column: 1 iff >= half of the non-NaN annotators voted 1."""
        self.data["toxic"] = (self.data[self.annotators].sum(axis=1) / \
                              self.data[self.annotators].count(axis=1) >= 0.5).astype(int)
        print(sum(self.data["toxic"]))

    def uncertainty(self):
        """Add an "uncertainty" column: p*(1-p) of the positive-vote fraction per row."""
        self.data["uncertainty"] = (self.data[self.annotators].sum(axis=1) \
                                    * (self.data[self.annotators].count(axis=1) - self.data[self.annotators].sum(
                                        axis=1)) \
                                    / (self.data[self.annotators].count(axis=1) * self.data[self.annotators].count(
                                        axis=1)))

    def CV(self):
        """Run cross-validation.

        In ensemble mode, one single-head model is trained per annotator (the
        annotator column is temporarily renamed to "toxic") and the per-annotator
        predictions are aggregated; otherwise a single CV run over the data.

        :return: ``(scores dict, results DataFrame)``.
        """
        if self.ensemble:
            ensemble_results = pd.DataFrame()
            for annotator in self.annotators:
                print("Training model for annotator", annotator)
                self.task_labels = ["toxic"]
                scores, results = self._CV(
                    self.data.rename(columns={
                        annotator: "toxic",
                        "toxic": "_toxic"
                    }))
                ensemble_results[annotator + "_pred"] = results["toxic_pred"]
                ensemble_results[annotator + "_label"] = results["toxic_label"]
                ensemble_results[annotator +
                                 "_masked_pred"] = results["toxic_masked_pred"]
                ensemble_results[
                    annotator + "_masked_label"] = results["toxic_masked_label"]
            self.task_labels = self.annotators
            scores = self.report_results(ensemble_results)
            return scores, ensemble_results
        else:
            return self._CV(self.data)

    def masks(self, df):
        """Encode each row's annotation pattern (labeled=1 / missing=0) as a class id.

        Used as the stratification target so folds balance annotation coverage.
        """
        df = df.replace(0, 1)
        df = df.replace(np.nan, 0)
        new_labels = LabelEncoder().fit_transform(
            [''.join(str(l) for l in row) for i, row in df.iterrows()])
        return new_labels

    def _CV(self, data):
        """k-fold CV loop: train on each fold, predict, and aggregate results.

        :param data: DataFrame to split (may be a renamed view in ensemble mode).
        :return: ``(scores dict, per-instance results DataFrame)``.
        """
        if self.params.stratified:
            kfold = StratifiedKFold(n_splits=self.params.num_folds,
                                    shuffle=True,
                                    random_state=self.params.random_state)
        else:
            kfold = KFold(n_splits=self.params.num_folds,
                          shuffle=True,
                          random_state=self.params.random_state)
        results = pd.DataFrame()
        i = 1
        # NOTE(review): splits are computed from self.data but rows are taken
        # from the `data` argument — relies on both sharing an index; confirm.
        for train_idx, test_idx in kfold.split(
                np.zeros(self.data.shape[0]),
                self.masks(self.data[self.annotators])):
            print("Fold #", i)
            train = data.loc[train_idx].reset_index()
            test = data.loc[test_idx].reset_index()
            """ if i == 1: test.to_csv(os.path.join(self.params.source_dir, "results", "GHC", "test_file.csv"), index=False) else: test.to_csv(os.path.join(self.params.source_dir, "results", "GHC", "test_file.csv"), index=False, header=False, mode="a") """
            train_batches = self.get_batches(train)
            test_batches = self.get_batches(test)
            self.train_model(train_batches)
            if self.params.predict == "label":
                # testing on the validation set
                fold_result = self.predict(test_batches)
                print("Test:")
                print(self.report_results(fold_result))
                fold_result["fold"] = pd.Series([i for id in test_idx])
                # NOTE(review): DataFrame.append is deprecated/removed in
                # modern pandas; pd.concat is the replacement.
                results = results.append(fold_result)
                # NOTE(review): the fold counter is only incremented in this
                # branch — in "mc" mode every fold is labeled 1; verify intent.
                i += 1
            elif self.params.predict == "mc":
                certainty_results = self.mc_predict(test_batches)
                fold_result = self.predict(test_batches)
                fold_result["fold"] = pd.Series([i for id in test_idx])
                fold_result = fold_result.join(certainty_results)
                results = results.append(fold_result)
        scores = self.report_results(results)
        print(scores)
        return scores, results

    def new_model(self):
        """Construct a fresh ClassifierBert configured for the current task mode."""
        if self.multi_task:
            return ClassifierBert(self.device, tasks=self.annotators)
        elif self.multi_label:
            return ClassifierBert(self.device, labels=len(self.annotators))
        elif self.log_reg:
            return ClassifierBert(self.device, labels=1, tasks=self.task_labels)
        else:
            return ClassifierBert(self.device)

    def create_loss_functions(self):
        """Build one loss per task label, class-weighted for imbalance.

        :return: dict mapping task label -> loss module (BCEWithLogits /
            MSE / weighted CrossEntropy depending on mode).
        """
        losses = dict()
        # self.class_weight = dict()
        for task_label in self.task_labels:
            _labels = [int(x) for x in self.data[task_label].dropna().tolist()]
            # NOTE(review): positional args to compute_class_weight were
            # removed in newer scikit-learn — keyword form
            # (class_weight=, classes=, y=) is required there.
            weight = compute_class_weight('balanced', np.unique(_labels),
                                          _labels)
            if len(weight) == 1:
                # Only one class present: fall back to a fixed weight pair.
                weight = [0.01, 1]
            weight = torch.tensor(weight, dtype=torch.float32).to(self.device)
            if self.multi_label:
                losses[task_label] = nn.BCEWithLogitsLoss(
                    reduction="sum")  # , pos_weight=class_weight)
            elif self.log_reg:
                losses[task_label] = nn.MSELoss()
            else:
                losses[task_label] = nn.CrossEntropyLoss(weight=weight)
        return losses

    def train_model(self, batches):
        """Train a fresh model on the given batches with a 90/10 train/val split.

        Per-task losses are computed only over each task's labeled (masked)
        instances and summed before backprop.
        """
        self.model = self.new_model()
        self.model = self.model.to(self.device)
        self.optimizer = AdamW(self.model.parameters(),
                               lr=self.params.learning_rate)
        train_batches, val_batches = train_test_split(
            batches,
            shuffle=True,
            random_state=self.params.random_state,
            test_size=.1)
        self.loss = self.create_loss_functions()
        for epoch in range(self.params.num_epochs):
            loss_val = 0
            self.model.train()
            for batch in train_batches:
                X_ids = torch.tensor(batch["inputs"]).to(self.device)
                X_att = torch.tensor(batch["attentions"]).to(self.device)
                # Skip batches with no labeled instance for any task.
                if len([
                        x for task_label in self.task_labels
                        for x in batch["masks"][task_label]
                ]) == 0:
                    continue
                logits, _ = self.model(X_ids, attn=X_att)
                class_loss = dict()
                weighted_sum = 0
                for task_label in self.task_labels:
                    masked_logits = logits[task_label][batch["masks"]
                                                       [task_label]]
                    masked_labels = [
                        batch["labels"][task_label][x]
                        for x in batch["masks"][task_label]
                    ]
                    if self.multi_task or self.ensemble:
                        # CrossEntropy targets must be Long tensors.
                        masked_labels = torch.tensor(masked_labels).type(
                            "torch.LongTensor").to(self.device)
                    else:
                        masked_labels = torch.tensor(masked_labels).to(
                            self.device)
                    if len(batch["masks"][task_label]) > 0:
                        ## list of loss values for each batch instance
                        class_loss[task_label] = self.loss[task_label](
                            masked_logits, masked_labels)
                        ## using a column of the data as the weight for loss value of each instance
                        # Batch["weight"] shows the instance weight (based on its certainty), class_weight shows the class weight for positive and negative labels
                        # batch["weights"][batch_i] *
                        """ class_loss[task_label] = sum([ batch_loss[mask_i] * self.class_weight[task_label][masked_labels[mask_i]] for mask_i, batch_i in enumerate(batch["masks"][task_label])]) weighted_sum += sum([self.class_weight[task_label][label] for label in masked_labels]) """
                total_loss = sum(class_loss.values())  # / weighted_sum
                loss_val += total_loss.item()
                # NOTE(review): no optimizer.zero_grad() call here —
                # gradients accumulate across all batches/epochs; likely a bug.
                total_loss.backward()
                self.optimizer.step()
            print("Epoch", epoch, "-", "Loss", round(loss_val, 3))
            if val_batches:
                val_results = self.predict(val_batches, self.model)
                print("Validation:")
                print(self.report_results(val_results))

    def predict(self, batches, model=None):
        """Predict on batches; return a DataFrame of per-task preds/labels.

        "masked" columns keep NaN where an instance is unlabeled for the task.
        """
        # NOTE(review): the `model` parameter is ignored — self.model is used.
        self.model.eval()
        results = defaultdict(list)
        for batch in batches:
            X_ids = torch.tensor(batch["inputs"]).to(self.device)
            X_att = torch.tensor(batch["attentions"]).to(self.device)
            logits, predictions = self.model(X_ids, attn=X_att)
            for task_label in self.task_labels:
                # NOTE(review): these membership tests compare label/prediction
                # VALUES against the mask's INDEX list; presumably the index
                # should be tested (enumerate) — verify against callers.
                masked_labels = [
                    x if x in batch["masks"][task_label] else np.nan
                    for x in batch["labels"][task_label]
                ]
                masked_predictions = [
                    x if x in batch["masks"][task_label] else np.nan
                    for x in predictions[task_label]
                ]
                results[task_label + "_masked_pred"].extend(masked_predictions)
                results[task_label + "_masked_label"].extend(masked_labels)
                results[task_label + "_pred"].extend(predictions[task_label])
                results[task_label + "_label"].extend(
                    batch["labels"][task_label])
                if self.params.task == "single":
                    # Positive-class probability for the single-head model.
                    results[task_label + "_logit"].extend(
                        softmax(logits[task_label].cpu().detach().numpy(),
                                axis=1)[:, 1])
        return pd.DataFrame.from_dict(results)

    def mc_predict(self, batches, model=None):
        """Monte-Carlo dropout prediction: mean and variance over mc_passes passes."""
        results = defaultdict(list)
        soft = nn.Softmax(dim=1)
        num_samples = sum([batch["batch_len"] for batch in batches])
        dropout_predictions = np.empty((0, num_samples, 1))
        for task_label in self.task_labels:
            for mc_pass in range(self.params.mc_passes):
                self.model.eval()
                # Re-enable dropout so repeated passes differ stochastically.
                self.enable_dropout(self.model)
                mc_predictions = np.empty((0, 1))
                for batch in batches:
                    X_ids = torch.tensor(batch["inputs"]).to(self.device)
                    X_att = torch.tensor(batch["attentions"]).to(self.device)
                    logits, predictions = self.model(X_ids, attn=X_att)
                    predictions = np.array(predictions[task_label])
                    mc_predictions = np.vstack(
                        (mc_predictions, predictions[:, np.newaxis]))
                dropout_predictions = np.vstack(
                    (dropout_predictions, mc_predictions[np.newaxis, :]))
            results[task_label + "_mean"] = list(
                np.squeeze(np.mean(dropout_predictions, axis=0)))
            results[task_label + "_variance"] = list(
                np.squeeze(np.var(dropout_predictions, axis=0)))
        return pd.DataFrame.from_dict(results)

    def enable_dropout(self, model):
        """Switch every Dropout module back to train mode (for MC dropout)."""
        for m in model.modules():
            if m.__class__.__name__.startswith('Dropout'):
                m.train()

    def report_results(self, results):
        """Compute A/P/R/F1 (or R^2 for regression) from a results DataFrame.

        In the multi-head case, per-annotator predictions are first reduced to
        a majority vote; masked majority-vote metrics are printed, then the
        unmasked majority vote is scored.

        :return: dict of rounded scores.
        """
        if self.log_reg:
            label_col = self.task_labels[0] + "_label"
            pred_col = self.task_labels[0] + "_pred"
            r2 = r2_score(results[label_col], results[pred_col])
            scores = {"r2": round(r2, 4)}
            return scores
        if len(self.task_labels) > 1:
            label_cols = [col + "_label" for col in self.annotators]
            pred_cols = [col + "_pred" for col in self.annotators]
            masked_label_cols = [
                col + "_masked_label" for col in self.annotators
            ]
            masked_pred_cols = [
                col + "_masked_pred" for col in self.annotators
            ]
            # Majority vote across annotator heads (unmasked and masked).
            toxic_label = results[label_cols].sum(
                axis=1) / results[label_cols].count(axis=1) >= 0.5
            toxic_pred = results[pred_cols].sum(
                axis=1) / results[pred_cols].count(axis=1) >= 0.5
            masked_toxic_label = results[masked_label_cols].sum(
                axis=1) / results[masked_label_cols].count(axis=1) >= 0.5
            masked_toxic_pred = results[masked_pred_cols].sum(
                axis=1) / results[masked_pred_cols].count(axis=1) >= 0.5
            print("Accuracy of the majority vote (after masking):")
            # Encode each instance as TP/TN/FP/FN from (label, prediction).
            result_cat = masked_toxic_label.map({
                True: "T",
                False: "F"
            }) + masked_toxic_pred.map({
                True: "T",
                False: "F"
            })
            result_cat = result_cat.map({
                "TT": "TP",
                "FF": "TN",
                "TF": "FN",
                "FT": "FP"
            })
            true_results = result_cat.isin(["TP", "TN"])
            counts = Counter(result_cat)
            a = Counter(true_results)[True] / results.shape[0]
            # max(..., 1) guards against division by zero.
            p = counts["TP"] / max((counts["TP"] + counts["FP"]), 1)
            r = counts["TP"] / max((counts["TP"] + counts["FN"]), 1)
            try:
                f = 2 * p * r / (p + r)
            except Exception:
                f = 0
            print({
                "A": round(a, 4),
                "P": round(p, 4),
                "R": round(r, 4),
                "F1": round(f, 4)
            })
            print("Accuracy of the majority vote (using all annotator heads):")
        else:
            toxic_label = results["toxic_label"] == 1
            toxic_pred = results["toxic_pred"] == 1
            print("Accuracy of single label")
        # Shared tail: score the (unmasked) toxic_label vs toxic_pred pair.
        result_cat = toxic_label.map({
            True: "T",
            False: "F"
        }) + toxic_pred.map({
            True: "T",
            False: "F"
        })
        result_cat = result_cat.map({
            "TT": "TP",
            "FF": "TN",
            "TF": "FN",
            "FT": "FP"
        })
        true_results = result_cat.isin(["TP", "TN"])
        counts = Counter(result_cat)
        a = Counter(true_results)[True] / results.shape[0]
        p = counts["TP"] / max((counts["TP"] + counts["FP"]), 1)
        r = counts["TP"] / max((counts["TP"] + counts["FN"]), 1)
        try:
            f = 2 * p * r / (p + r)
        except Exception:
            f = 0
        scores = {
            "A": round(a, 4),
            "P": round(p, 4),
            "R": round(r, 4),
            "F1": round(f, 4)
        }
        return scores

    def get_batches(self, data):
        """Tokenize and batch the data; attach labels, masks, and weights.

        :param data: DataFrame with "text" and one column per task label.
        :return: list of batch dicts with keys "inputs", "attentions",
            "labels", "masks", "batch_len", "weights".
        """
        if isinstance(self.params.sort_by, str):
            # Sorting (e.g. by length) groups similar rows into a batch.
            data = data.sort_values(by=[self.params.sort_by],
                                    ascending=False).reset_index()
        batches = list()
        for s in range(0, len(data), self.params.batch_size):
            e = s + self.params.batch_size if s + self.params.batch_size < len(
                data) else len(data)
            data_info = self.batch_to_info(data["text"].tolist()[s:e])
            anno_batch = dict()
            mask_batch = dict()
            for task_label in self.task_labels:
                anno_batch[task_label] = data[task_label].tolist()[s:e]
                # Mask = indices of rows this task actually labeled (non-NaN).
                mask_batch[task_label] = [i for i, h in enumerate(anno_batch[task_label]) \
                                          if not math.isnan(h)]
            data_info["labels"] = anno_batch
            data_info["masks"] = mask_batch
            # data_info["majority_vote"] = data["toxic"].tolist()[s: e]
            data_info["batch_len"] = e - s
            if isinstance(self.params.batch_weight, str):
                data_info["weights"] = data[
                    self.params.batch_weight].tolist()[s:e]
            else:
                data_info["weights"] = [1 for i in range(e - s)]
            batches.append(data_info)
        return batches

    def batch_to_info(self, batch):
        """Tokenize a list of texts; return dict with input ids and attention masks."""
        batch_info = dict()
        if isinstance(self.params.max_len, int):
            tokens = self.tokenizer(batch,
                                    padding="max_length",
                                    max_length=self.params.max_len,
                                    truncation=True)
        else:
            # No fixed length configured: pad dynamically to the batch max.
            tokens = self.tokenizer(batch, padding=True, truncation=True)
        batch_info["inputs"] = tokens["input_ids"]
        batch_info["attentions"] = tokens["attention_mask"]
        return batch_info
hidden_size=768, drop_rate=0.1) model_name = 'bert_encoder_on_fewrel' # set optimizer batch_size = 32 train_epoch = 10 param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False) framework.train_encoder_epoch(model, model_name, optimizer=optimizer, batch_size=batch_size, train_epoch=train_epoch, learning_rate=2e-5, warmup=True)
class MAML:
    """MAML-style meta-learner for relation classification with a replay memory.

    An inner SGD loop adapts a functional copy of the prediction network (PN)
    on support batches via ``higher``; the outer AdamW loop applies
    meta-gradients computed on query batches.
    """

    def __init__(self, device, **kwargs):
        """Build PN, inner/meta optimizers, replay memory, and loss.

        Expected kwargs: inner_lr, meta_lr, write_prob, replay_rate,
        replay_every, model, max_length.
        """
        self.inner_lr = kwargs.get('inner_lr')
        self.meta_lr = kwargs.get('meta_lr')
        self.write_prob = kwargs.get('write_prob')
        self.replay_rate = kwargs.get('replay_rate')
        self.replay_every = kwargs.get('replay_every')
        self.device = device
        self.pn = TransformerClsModel(model_name=kwargs.get('model'),
                                      n_classes=1,
                                      max_length=kwargs.get('max_length'),
                                      device=device)
        logger.info('Loaded {} as PN'.format(self.pn.__class__.__name__))
        meta_params = [p for p in self.pn.parameters() if p.requires_grad]
        self.meta_optimizer = AdamW(meta_params, lr=self.meta_lr)
        self.memory = ReplayMemory(write_prob=self.write_prob, tuple_size=3)
        self.loss_fn = nn.BCEWithLogitsLoss()
        inner_params = [p for p in self.pn.parameters() if p.requires_grad]
        self.inner_optimizer = optim.SGD(inner_params, lr=self.inner_lr)

    def save_model(self, model_path):
        """Serialize PN weights to model_path."""
        checkpoint = self.pn.state_dict()
        torch.save(checkpoint, model_path)

    def load_model(self, model_path):
        """Load PN weights from model_path."""
        checkpoint = torch.load(model_path)
        self.pn.load_state_dict(checkpoint)

    def evaluate(self, dataloader, updates, mini_batch_size):
        """Adapt on memory-sampled support batches, then score on dataloader.

        :param dataloader: evaluation batches of (text, label, candidates).
        :param updates: number of inner adaptation steps.
        :param mini_batch_size: support batch size read from replay memory.
        :return: accuracy on the dataloader.
        """
        # PN stays in train mode so inner-loop adaptation uses dropout etc.;
        # intentional for MAML-style evaluation.
        self.pn.train()
        support_set = []
        for _ in range(updates):
            text, label, candidates = self.memory.read_batch(
                batch_size=mini_batch_size)
            support_set.append((text, label, candidates))
        with higher.innerloop_ctx(self.pn,
                                  self.inner_optimizer,
                                  copy_initial_weights=False,
                                  track_higher_grads=False) as (fpn, diffopt):
            # Inner loop: one differentiable-optimizer step per support batch.
            task_predictions, task_labels = [], []
            support_loss = []
            for text, label, candidates in support_set:
                replicated_text, replicated_relations, ranking_label = datasets.utils.replicate_rel_data(
                    text, label, candidates)
                input_dict = self.pn.encode_text(
                    list(zip(replicated_text, replicated_relations)))
                output = fpn(input_dict)
                targets = torch.tensor(ranking_label).float().unsqueeze(1).to(
                    self.device)
                loss = self.loss_fn(output, targets)
                diffopt.step(loss)
                pred, true_labels = models.utils.make_rel_prediction(
                    output, ranking_label)
                support_loss.append(loss.item())
                task_predictions.extend(pred.tolist())
                task_labels.extend(true_labels.tolist())
            acc = models.utils.calculate_accuracy(task_predictions,
                                                  task_labels)
            logger.info(
                'Support set metrics: Loss = {:.4f}, accuracy = {:.4f}'.format(
                    np.mean(support_loss), acc))
            # Evaluate the adapted network (fpn) without tracking gradients.
            all_losses, all_predictions, all_labels = [], [], []
            for text, label, candidates in dataloader:
                replicated_text, replicated_relations, ranking_label = datasets.utils.replicate_rel_data(
                    text, label, candidates)
                with torch.no_grad():
                    input_dict = self.pn.encode_text(
                        list(zip(replicated_text, replicated_relations)))
                    output = fpn(input_dict)
                    targets = torch.tensor(ranking_label).float().unsqueeze(
                        1).to(self.device)
                    loss = self.loss_fn(output, targets)
                loss = loss.item()
                pred, true_labels = models.utils.make_rel_prediction(
                    output, ranking_label)
                all_losses.append(loss)
                all_predictions.extend(pred.tolist())
                all_labels.extend(true_labels.tolist())
        acc = models.utils.calculate_accuracy(all_predictions, all_labels)
        logger.info('Test metrics: Loss = {:.4f}, accuracy = {:.4f}'.format(
            np.mean(all_losses), acc))
        return acc

    def training(self, train_datasets, **kwargs):
        """Episodic meta-training loop over the concatenated train datasets.

        Each episode: `updates` support batches adapt a functional PN copy
        (inner loop); query batches (fresh data, or replayed from memory every
        ``replay_freq`` episodes) yield the meta-gradient applied to the real
        PN (outer loop). Terminates when the dataloader is exhausted.
        """
        updates = kwargs.get('updates')
        mini_batch_size = kwargs.get('mini_batch_size')
        if self.replay_rate != 0:
            replay_batch_freq = self.replay_every // mini_batch_size
            replay_freq = int(math.ceil((replay_batch_freq + 1) / (updates + 1)))
            replay_steps = int(self.replay_every * self.replay_rate /
                               mini_batch_size)
        else:
            replay_freq = 0
            replay_steps = 0
        logger.info('Replay frequency: {}'.format(replay_freq))
        logger.info('Replay steps: {}'.format(replay_steps))
        concat_dataset = data.ConcatDataset(train_datasets)
        train_dataloader = iter(
            data.DataLoader(concat_dataset,
                            batch_size=mini_batch_size,
                            shuffle=False,
                            collate_fn=datasets.utils.rel_encode))
        episode_id = 0
        while True:
            self.inner_optimizer.zero_grad()
            support_loss, support_acc = [], []
            with higher.innerloop_ctx(self.pn,
                                      self.inner_optimizer,
                                      copy_initial_weights=False,
                                      track_higher_grads=False) as (fpn,
                                                                    diffopt):
                # Inner loop
                support_set = []
                task_predictions, task_labels = [], []
                for _ in range(updates):
                    try:
                        text, label, candidates = next(train_dataloader)
                        support_set.append((text, label, candidates))
                    except StopIteration:
                        logger.info(
                            'Terminating training as all the data is seen')
                        return
                for text, label, candidates in support_set:
                    replicated_text, replicated_relations, ranking_label = datasets.utils.replicate_rel_data(
                        text, label, candidates)
                    input_dict = self.pn.encode_text(
                        list(zip(replicated_text, replicated_relations)))
                    output = fpn(input_dict)
                    targets = torch.tensor(ranking_label).float().unsqueeze(
                        1).to(self.device)
                    loss = self.loss_fn(output, targets)
                    diffopt.step(loss)
                    pred, true_labels = models.utils.make_rel_prediction(
                        output, ranking_label)
                    support_loss.append(loss.item())
                    task_predictions.extend(pred.tolist())
                    task_labels.extend(true_labels.tolist())
                    # Support examples also populate the replay memory.
                    self.memory.write_batch(text, label, candidates)
                acc = models.utils.calculate_accuracy(task_predictions,
                                                      task_labels)
                logger.info(
                    'Episode {} support set: Loss = {:.4f}, accuracy = {:.4f}'.
                    format(episode_id + 1, np.mean(support_loss), acc))
                # Outer loop
                query_loss, query_acc = [], []
                query_set = []
                if self.replay_rate != 0 and (episode_id +
                                              1) % replay_freq == 0:
                    # Replay episode: query batches drawn from memory.
                    for _ in range(replay_steps):
                        text, label, candidates = self.memory.read_batch(
                            batch_size=mini_batch_size)
                        query_set.append((text, label, candidates))
                else:
                    try:
                        text, label, candidates = next(train_dataloader)
                        query_set.append((text, label, candidates))
                        self.memory.write_batch(text, label, candidates)
                    except StopIteration:
                        logger.info(
                            'Terminating training as all the data is seen')
                        return
                for text, label, candidates in query_set:
                    replicated_text, replicated_relations, ranking_label = datasets.utils.replicate_rel_data(
                        text, label, candidates)
                    input_dict = self.pn.encode_text(
                        list(zip(replicated_text, replicated_relations)))
                    output = fpn(input_dict)
                    targets = torch.tensor(ranking_label).float().unsqueeze(
                        1).to(self.device)
                    loss = self.loss_fn(output, targets)
                    query_loss.append(loss.item())
                    pred, true_labels = models.utils.make_rel_prediction(
                        output, ranking_label)
                    acc = models.utils.calculate_accuracy(
                        pred.tolist(), true_labels.tolist())
                    query_acc.append(acc)
                # PN meta gradients
                # NOTE(review): `loss` here is only the LAST query batch's
                # loss — earlier query batches do not contribute to the
                # meta-gradient; confirm whether a summed loss was intended.
                pn_params = [p for p in fpn.parameters() if p.requires_grad]
                meta_pn_grads = torch.autograd.grad(loss, pn_params)
                pn_params = [p for p in self.pn.parameters() if p.requires_grad]
                for param, meta_grad in zip(pn_params, meta_pn_grads):
                    if param.grad is not None:
                        param.grad += meta_grad.detach()
                    else:
                        param.grad = meta_grad.detach()
            # Meta optimizer step
            self.meta_optimizer.step()
            self.meta_optimizer.zero_grad()
            logger.info(
                'Episode {} query set: Loss = {:.4f}, accuracy = {:.4f}'.format(
                    episode_id + 1, np.mean(query_loss), np.mean(query_acc)))
            episode_id += 1

    def testing(self, test_dataset, **kwargs):
        """Evaluate on test_dataset after `updates` adaptation steps; return accuracy."""
        updates = kwargs.get('updates')
        mini_batch_size = kwargs.get('mini_batch_size')
        test_dataloader = data.DataLoader(test_dataset,
                                          batch_size=mini_batch_size,
                                          shuffle=False,
                                          collate_fn=datasets.utils.rel_encode)
        acc = self.evaluate(dataloader=test_dataloader,
                            updates=updates,
                            mini_batch_size=mini_batch_size)
        logger.info('Overall test metrics: Accuracy = {:.4f}'.format(acc))
        return acc
def main():
    """CLI entry point for BERT-BiLSTM-CRF NER training / evaluation / testing.

    Side effects: (re)creates ``args.output_dir``, writes TensorBoard logs under
    ``output_dir/eval``, persists ``label2id.pkl``, saves the best checkpoint
    during training, and writes ``token_labels_.txt`` predictions for --do_test.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_file", default=None, type=str)
    parser.add_argument("--eval_file", default=None, type=str)
    parser.add_argument("--test_file", default=None, type=str)
    parser.add_argument("--model_name_or_path", default=None, type=str)
    parser.add_argument("--output_dir", default=None, type=str)

    ## other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=256, type=int)
    parser.add_argument("--do_train", default=False, type=boolean_string)
    parser.add_argument("--do_eval", default=False, type=boolean_string)
    parser.add_argument("--do_test", default=False, type=boolean_string)
    parser.add_argument("--train_batch_size", default=8, type=int)
    parser.add_argument("--eval_batch_size", default=8, type=int)
    parser.add_argument("--learning_rate", default=3e-5, type=float)
    parser.add_argument("--num_train_epochs", default=10, type=float)
    # NOTE(review): flag name is misspelled ("proprotion"); kept as-is for CLI
    # compatibility — renaming it would break existing launch scripts.
    parser.add_argument("--warmup_proprotion", default=0.1, type=float)
    parser.add_argument("--use_weight", default=1, type=int)
    parser.add_argument("--local_rank", type=int, default=-1)
    parser.add_argument("--seed", type=int, default=2020)
    # NOTE(review): no type= here, so any value passed on the command line
    # becomes a truthy string — confirm callers only rely on the default.
    parser.add_argument("--fp16", default=False)
    parser.add_argument("--loss_scale", type=float, default=0)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument("--warmup_steps", default=0, type=int)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float)
    parser.add_argument("--max_steps", default=-1, type=int)
    parser.add_argument("--do_lower_case", action='store_true')
    parser.add_argument("--logging_steps", default=500, type=int)
    parser.add_argument("--clean",
                        default=False,
                        type=boolean_string,
                        help="clean the output dir")
    parser.add_argument("--need_birnn", default=False, type=boolean_string)
    parser.add_argument("--rnn_dim", default=128, type=int)
    args = parser.parse_args()

    # NOTE(review): hard-codes CUDA; this raises on CPU-only machines. Consider
    # torch.device("cuda" if torch.cuda.is_available() else "cpu").
    device = torch.device("cuda")
    # os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_
    args.device = device
    n_gpu = torch.cuda.device_count()
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger.info(f"device: {device} n_gpu: {n_gpu}")

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # now_time = datetime.datetime.now().strftime('%Y-%m-%d_%H')
    # tmp_dir = args.output_dir + '/' +str(now_time) + '_ernie'
    # if not os.path.exists(tmp_dir):
    #     os.makedirs(tmp_dir)
    # args.output_dir = tmp_dir

    # Optionally wipe the output directory before a fresh training run.
    if args.clean and args.do_train:
        # logger.info("cleaning")
        if os.path.exists(args.output_dir):

            def del_file(path):
                # Recursively delete everything under `path` (files and dirs).
                ls = os.listdir(path)
                for i in ls:
                    c_path = os.path.join(path, i)
                    print(c_path)
                    if os.path.isdir(c_path):
                        del_file(c_path)
                        os.rmdir(c_path)
                    else:
                        os.remove(c_path)

            try:
                del_file(args.output_dir)
            except Exception as e:
                print(e)
                print('pleace remove the files of output dir and data.conf')
                exit(-1)

    # Refuse to overwrite an existing non-empty output dir when training.
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if not os.path.exists(os.path.join(args.output_dir, "eval")):
        os.makedirs(os.path.join(args.output_dir, "eval"))

    writer = SummaryWriter(logdir=os.path.join(args.output_dir, "eval"),
                           comment="Linear")

    processor = NerProcessor()
    label_list = get_labels(r"./data/labels.txt")
    num_labels = len(label_list)
    args.label_list = label_list

    # Reuse a previously persisted label mapping so ids stay stable across runs.
    if os.path.exists(os.path.join(args.output_dir, "label2id.pkl")):
        with open(os.path.join(args.output_dir, "label2id.pkl"), "rb") as f:
            label2id = pickle.load(f)
    else:
        label2id = {l: i for i, l in enumerate(label_list)}
        with open(os.path.join(args.output_dir, "label2id.pkl"), "wb") as f:
            pickle.dump(label2id, f)
    id2label = {value: key for key, value in label2id.items()}

    # Prepare optimizer and schedule (linear warmup and decay)
    if args.do_train:
        tokenizer = BertTokenizer.from_pretrained(
            args.tokenizer_name
            if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        config = BertConfig.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            num_labels=num_labels)
        model = BERT_BiLSTM_CRF.from_pretrained(args.model_name_or_path,
                                                config=config,
                                                need_birnn=args.need_birnn,
                                                rnn_dim=args.rnn_dim)
        model.to(device)

        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        train_examples, train_features, train_data = get_Dataset(
            args, processor, tokenizer, label_list, mode="train")
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        if args.do_eval:
            eval_examples, eval_features, eval_data = get_Dataset(
                args, processor, tokenizer, label_list, mode="eval")

        # Derive total optimizer steps (or, with --max_steps, the epoch count).
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (
                len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(
                train_dataloader
            ) // args.gradient_accumulation_steps * args.num_train_epochs

        # No weight decay on biases and LayerNorm weights (standard BERT recipe).
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params': [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Num Epochs = %d", args.num_train_epochs)
        logger.info("  Total optimization steps = %d", t_total)

        model.train()
        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        best_f1 = 0.0
        for ep in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, bbox, bbox_pos_id, bbox_num = batch
                # Model returns the (CRF) loss directly, not a tuple.
                outputs = model(input_ids, bbox, bbox_pos_id, bbox_num,
                                label_ids, segment_ids, input_mask)
                loss = outputs

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()
                # Step the optimizer only on accumulation boundaries.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                        tr_loss_avg = (tr_loss -
                                       logging_loss) / args.logging_steps
                        writer.add_scalar("Train/loss", tr_loss_avg,
                                          global_step)
                        logging_loss = tr_loss

            # Per-epoch eval; keep only the best-F1 checkpoint in output_dir.
            if args.do_eval:
                all_ori_tokens_eval = [f.ori_tokens for f in eval_features]
                overall, by_type = evaluate(args, eval_data, model, id2label,
                                            all_ori_tokens_eval)

                # add eval result to tensorboard
                f1_score = overall.fscore
                writer.add_scalar("Eval/precision", overall.prec, ep)
                writer.add_scalar("Eval/recall", overall.rec, ep)
                writer.add_scalar("Eval/f1_score", overall.fscore, ep)

                # save the best performs model
                if f1_score > best_f1:
                    logger.info(
                        f"----------the best f1 is {f1_score}---------")
                    best_f1 = f1_score
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(args.output_dir)
                    tokenizer.save_pretrained(args.output_dir)

                    # Good practice: save your training arguments together with the trained model
                    torch.save(
                        args,
                        os.path.join(args.output_dir, 'training_args.bin'))

        # logger.info(f'epoch {ep}, train loss: {tr_loss}')
        # writer.add_graph(model)
        writer.close()

    # model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    # model_to_save.save_pretrained(args.output_dir)
    # tokenizer.save_pretrained(args.output_dir)
    # Good practice: save your training arguments together with the trained model
    # torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

    # Test: reload the best checkpoint from output_dir and write predictions.
    if args.do_test:
        # model = BertForTokenClassification.from_pretrained(args.output_dir)
        # model.to(device)
        label_map = {i: label for i, label in enumerate(label_list)}

        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        #args = torch.load(os.path.join(args.output_dir, 'training_args.bin'))
        model = BERT_BiLSTM_CRF.from_pretrained(args.output_dir,
                                                need_birnn=args.need_birnn,
                                                rnn_dim=args.rnn_dim)
        model.to(device)

        test_examples, test_features, test_data = get_Dataset(args,
                                                              processor,
                                                              tokenizer,
                                                              label_list,
                                                              mode="test")

        logger.info("***** Running test *****")
        logger.info(f"  Num examples = {len(test_examples)}")
        logger.info(f"  Batch size = {args.eval_batch_size}")

        all_ori_tokens = [f.ori_tokens for f in test_features]
        all_ori_labels = [e.label for e in test_examples]
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.eval_batch_size)
        model.eval()

        pred_labels = []

        for b_i, (input_ids, input_mask, segment_ids, label_ids, bbox,
                  bbox_pos_id, bbox_num) in enumerate(
                      tqdm(test_dataloader, desc="Predicting")):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            bbox = bbox.to(device)
            bbox_pos_id = bbox_pos_id.to(device)
            bbox_num = bbox_num.to(device)

            with torch.no_grad():
                # predict() is assumed to return per-sequence label-id lists
                # (CRF decode) — TODO confirm against BERT_BiLSTM_CRF.
                logits = model.predict(input_ids, segment_ids, input_mask,
                                       bbox, bbox_pos_id, bbox_num)
            # logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            # logits = logits.detach().cpu().numpy()

            for l in logits:
                pred_label = []
                for idx in l:
                    pred_label.append(id2label[idx])
                pred_labels.append(pred_label)

        assert len(pred_labels) == len(all_ori_tokens) == len(all_ori_labels)
        print(len(pred_labels))
        # Emit "token gold_label predicted_label" per line, blank line between
        # sentences; [CLS]/[SEP] markers become sentence separators.
        with open(os.path.join(args.output_dir, "token_labels_.txt"),
                  "w",
                  encoding="utf-8") as f:
            for ori_tokens, ori_labels, prel in zip(all_ori_tokens,
                                                    all_ori_labels,
                                                    pred_labels):
                for ot, ol, pl in zip(ori_tokens, ori_labels, prel):
                    if ot in ["[CLS]", "[SEP]"]:
                        f.write("\n")
                        continue
                    else:
                        f.write(f"{ot} {ol} {pl}\n")
                f.write("\n")
def train(args, processor, model, tokenizer):
    """Train `model` on the cached training examples (single-GPU loop).

    :param args: namespace with training hyperparameters (batch size, lr,
        max_steps, gradient_accumulation_steps, logging/save steps, device, ...)
    :param processor: project data processor used to load/cache examples
    :param model: transformers-style model returning (loss, ...) tuples
    :param tokenizer: tokenizer (forwarded to evaluation)
    :return: (global_step, average training loss per optimizer step)
    Side effects: writes TensorBoard scalars and periodic checkpoints under
    args.output_dir, mutates model parameters, and may mutate
    args.num_train_epochs when --max_steps is set.
    """
    tb_writer = SummaryWriter()

    train_dataset = load_and_cache_examples(args,
                                            processor,
                                            tokenizer,
                                            evaluate=False)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # max_steps dominates: recompute the epoch count from it; otherwise derive
    # total optimizer steps from epochs and accumulation.
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay); no weight decay
    # for biases and LayerNorm weights (standard BERT fine-tuning recipe).
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            # NOTE(review): gradients are clipped every micro-batch, even when
            # accumulating — confirm this is intended (usually done only right
            # before optimizer.step()).
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, processor, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            # NOTE(review): uses `>` not `>=`, so training runs one extra
            # optimizer step past max_steps — confirm if intentional.
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    tb_writer.close()
    return global_step, tr_loss / global_step
def configure_optimizers(self):
    """Return the AdamW optimizer over all model parameters.

    Learning rate comes from ``self.hparams.learning_rate``; bias
    correction is explicitly enabled via ``correct_bias=True``.
    """
    optimizer = AdamW(
        self.model.parameters(),
        lr=self.hparams.learning_rate,
        correct_bias=True,
    )
    return optimizer
def train(args, train_dataset, model, tokenizer):
    """Train the model (supports DataParallel, DDP, apex fp16, and TPU).

    :param args: namespace with device/rank info and training hyperparameters
    :param train_dataset: torch Dataset of pre-tensorized training examples
    :param model: transformers-style model returning (loss, ...) tuples
    :param tokenizer: tokenizer (forwarded to evaluation)
    :return: (global_step, average training loss per step)
    Side effects: writes TensorBoard logs and checkpoints on rank -1/0,
    mutates args.train_batch_size and (with --max_steps) args.num_train_epochs.
    """
    # Only the main process (local_rank -1 or 0) owns the TensorBoard writer.
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # Derive total optimizer steps (max_steps dominates epochs if set).
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay); no decay on
    # biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproductibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in [
                    'bert', 'xlnet'
                ] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            # print("loss: "+ str(loss))
            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                # amp rescales the loss so fp16 gradients don't underflow.
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1
                ) % args.gradient_accumulation_steps == 0 and not args.tpu:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            # NOTE(review): on TPU this steps the optimizer and bumps
            # global_step EVERY micro-batch, ignoring gradient accumulation
            # boundaries (the `not args.tpu` guard above skips the normal
            # path) — confirm this double-path counting is intended.
            if args.tpu:
                args.xla_model.optimizer_step(optimizer, barrier=True)
                model.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
def train(args, model, train_dataset, dev_dataset=None, test_dataset=None):
    """Fine-tune `model` on `train_dataset` with periodic eval and checkpoints.

    :param args: namespace of hyperparameters (batch size, lr, warmup_proportion,
        max_steps, logging/save steps, device, model_type, ...)
    :param model: transformers-style model returning (loss, ...) tuples
    :param train_dataset: torch Dataset of tensorized training examples
    :param dev_dataset: used by evaluate() unless evaluate_test_during_training
    :param test_dataset: used by evaluate() when evaluate_test_during_training
    :return: (global_step, average training loss per optimizer step)
    """
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)
    # max_steps overrides the epoch count; otherwise derive total steps.
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay); no decay on
    # biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    # Warmup steps are a fraction of the total schedule.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(t_total * args.warmup_proportion),
        num_training_steps=t_total)

    # if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
    #     os.path.join(args.model_name_or_path, "scheduler.pt")
    # ):
    #     # Load optimizer and scheduler states
    #     optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
    #     scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Total train batch size = %d", args.train_batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    logger.info("  Logging steps = %d", args.logging_steps)
    logger.info("  Save steps = %d", args.save_steps)

    global_step = 0
    tr_loss = 0.0
    model.zero_grad()
    mb = master_bar(range(int(args.num_train_epochs)))
    for epoch in mb:
        epoch_iterator = progress_bar(train_dataloader, parent=mb)
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type not in ["distilkobert", "xlm-roberta"]:
                inputs["token_type_ids"] = batch[
                    2]  # Distilkobert, XLM-Roberta don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()
            # Step on accumulation boundaries; the second clause also steps
            # once per epoch when the dataloader is shorter than the
            # accumulation window.
            if (step + 1) % args.gradient_accumulation_steps == 0 or (
                    len(train_dataloader) <= args.gradient_accumulation_steps
                    and (step + 1) == len(train_dataloader)):
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    if args.evaluate_test_during_training:
                        evaluate(args, model, test_dataset, "test",
                                 global_step)
                    else:
                        evaluate(args, model, dev_dataset, "dev", global_step)

                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (model.module
                                     if hasattr(model, "module") else model)
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info(
                        "Saving model checkpoint to {}".format(output_dir))

                    if args.save_optimizer:
                        torch.save(optimizer.state_dict(),
                                   os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(),
                                   os.path.join(output_dir, "scheduler.pt"))
                        logger.info(
                            "Saving optimizer and scheduler states to {}".
                            format(output_dir))

            # NOTE(review): `>` (not `>=`) means one extra step beyond
            # max_steps — confirm if intentional.
            if args.max_steps > 0 and global_step > args.max_steps:
                break

        mb.write("Epoch {} done".format(epoch + 1))

        if args.max_steps > 0 and global_step > args.max_steps:
            break

    return global_step, tr_loss / global_step
def main(params: dict, output_dir: str):
    """Train a SAKT transformer on the Riiid dataset split and log to MLflow.

    :param params: hyperparameters (max_seq, batch_size, embed_dim, lr,
        num_warmup_steps, ...)
    :param output_dir: directory receiving oof predictions, model weights,
        params json, and (optionally) pickled feature factories.
    Relies on module-level globals: is_debug, load_pickle, device, epochs,
    dropout, is_make_feature_factory — TODO confirm they are defined at import.
    """
    import mlflow
    print("start params={}".format(params))
    logger = get_logger()
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/split10/train_0.pickle"
    ).sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    # Columns embedded as categorical ids by the transformer feature factory.
    column_config = {
        ("content_id", "content_type_id"): {
            "type": "category"
        },
        "user_answer": {
            "type": "category"
        },
        "part": {
            "type": "category"
        },
        "prior_question_elapsed_time_bin300": {
            "type": "category"
        },
        "duration_previous_content_bin300": {
            "type": "category"
        }
    }
    # Recompute engineered features unless loading cached pickles.
    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent()
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="train_0",
            load_feature=not is_debug,
            save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)
        df = df[[
            "user_id", "content_id", "content_type_id", "part", "user_answer",
            "answered_correctly", "prior_question_elapsed_time_bin300",
            "duration_previous_content_bin300"
        ]]
        print(df.head(10))

    print("data preprocess")

    # Per-user split: ~1% of users go entirely to validation; for the rest,
    # the last 5% of each user's history is held out.
    train_idx = []
    val_idx = []
    np.random.seed(0)
    for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
        if np.random.random() < 0.01:
            # all val
            val_idx.extend(w_df.index.tolist())
        else:
            train_num = int(len(w_df) * 0.95)
            train_idx.extend(w_df[:train_num].index.tolist())
            val_idx.extend(w_df[train_num:].index.tolist())

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.make_dict(df=pd.DataFrame())
    n_skill = len(ff_for_transformer.embbed_dict[("content_id",
                                                  "content_type_id")])

    if not load_pickle or is_debug:
        df["is_val"] = 0
        # NOTE(review): chained assignment (df["is_val"].loc[...]) — pandas may
        # not write through; prefer df.loc[val_idx, "is_val"] = 1. Kept as-is.
        df["is_val"].loc[val_idx] = 1

        w_df = df[df["is_val"] == 0]
        # Chunk each user's history into max_seq-sized groups so long users
        # become multiple pseudo-users ("uid_group").
        w_df["group"] = (
            w_df.groupby("user_id")["user_id"].transform("count") -
            w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(
            str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)

        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    # Cache / restore the heavy datasets (skipped entirely in debug mode).
    os.makedirs("../input/feature_engineering/model064", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model064/train.pickle",
                  "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model064/val.pickle",
                  "wb") as f:
            pickle.dump(dataset_val, f)

    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model064/train.pickle",
                  "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model064/val.pickle",
                  "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")

    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True,
                                  num_workers=1)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False,
                                num_workers=1)

    model = SAKTModel(n_skill,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout)

    # AdamW with no weight decay on biases / LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=params["lr"],
        weight_decay=0.01,
    )
    num_train_optimization_steps = int(len(dataloader_train) * epochs)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)

    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, scheduler, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".
              format(epoch, loss, auc, auc_val))

    # Final validation pass: score only the last position of each sequence.
    preds = []
    labels = []
    for item in tqdm(dataloader_val):
        x = item["x"].to(device).long()
        target_id = item["target_id"].to(device).long()
        part = item["part"].to(device).long()
        label = item["label"].to(device).float()
        elapsed_time = item["elapsed_time"].to(device).long()
        duration_previous_content = item["duration_previous_content"].to(
            device).long()
        output = model(x, target_id, part, elapsed_time,
                       duration_previous_content)

        preds.extend(torch.nn.Sigmoid()(
            output[:, -1]).view(-1).data.cpu().numpy().tolist())
        labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))
    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))

    df_oof["predict"] = preds
    df_oof["target"] = labels

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values,
                            df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    # Log the run to MLflow (skipped in debug mode).
    if not is_debug:
        mlflow.start_run(experiment_id=10,
                         run_name=os.path.basename(__file__))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    # Optionally fit and pickle the feature factories on the FULL dataset for
    # inference-time use.
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle(
            "../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        # Strip loggers so the objects are picklable.
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)
def train(self, train_dataset, output_dir, show_running_loss=True):
    """ Trains the model on train_dataset. Utility function to be used by the train_model() method. Not intended to be used directly. """
    # NOTE(review): `tokenizer` is bound but never used in this method — presumably
    # kept for parity with sibling methods; confirm before removing.
    tokenizer = self.tokenizer
    device = self.device
    model = self.model
    # `args` is a plain dict of hyperparameters (indexed with string keys below).
    args = self.args

    # TensorBoard writer for lr/loss curves.
    tb_writer = SummaryWriter()

    # Shuffle training examples each epoch via a random sampler.
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                  batch_size=args["train_batch_size"])

    # Total number of optimizer steps: batches per epoch, divided by the
    # accumulation factor, times the number of epochs.
    t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"]

    # Standard transformers practice: no weight decay on biases and LayerNorm weights.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(
            nd in n for nd in no_decay)], "weight_decay": args["weight_decay"]},
        {"params": [p for n, p in model.named_parameters() if any(
            nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]

    # If the caller left warmup_steps at 0, derive it from warmup_ratio;
    # otherwise keep the explicit value. NOTE(review): this mutates the shared
    # args dict in place — confirm callers do not rely on the original value.
    warmup_steps = math.ceil(t_total * args["warmup_ratio"])
    args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args["learning_rate"], eps=args["adam_epsilon"])
    # Linear warmup then linear decay over t_total steps.
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args["warmup_steps"],
                                     t_total=t_total)

    if args["fp16"]:
        # apex is an optional dependency; fail with an actionable message.
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args["fp16_opt_level"])

    global_step = 0
    # tr_loss accumulates across the whole run; logging_loss snapshots it so the
    # TensorBoard "loss" scalar is the mean over the last logging window.
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch")

    for _ in train_iterator:
        # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration")):
            model.train()
            batch = tuple(t.to(device) for t in batch)

            # _get_inputs_dict maps the raw batch tuple to the model's kwargs
            # (defined elsewhere on this class).
            inputs = self._get_inputs_dict(batch)
            outputs = model(**inputs)
            # model outputs are always tuple in pytorch-transformers (see doc)
            loss = outputs[0]

            if show_running_loss:
                # \r keeps the running loss on a single console line.
                print("\rRunning loss: %f" % loss, end="")

            # Scale the loss so accumulated gradients average over the window.
            if args["gradient_accumulation_steps"] > 1:
                loss = loss / args["gradient_accumulation_steps"]

            if args["fp16"]:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                # Clip on amp's master (fp32) params when training in fp16.
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args["max_grad_norm"])
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args["max_grad_norm"])

            tr_loss += loss.item()
            # Only step the optimizer once per accumulation window.
            if (step + 1) % args["gradient_accumulation_steps"] == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
                    # Log metrics
                    # Only evaluate when single GPU otherwise metrics may not average well
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss",
                                         (tr_loss - logging_loss)/args["logging_steps"],
                                         global_step)
                    logging_loss = tr_loss

                if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
                    # Save model checkpoint
                    # NOTE(review): this rebinds the `output_dir` parameter, so
                    # later checkpoints nest inside earlier checkpoint dirs
                    # (checkpoint-N/checkpoint-M/...) — confirm this is intended.
                    output_dir = os.path.join(output_dir,
                                              "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir)

    # Returns the number of optimizer steps taken and the mean loss per step.
    return global_step, tr_loss / global_step
def train(args):
    """Train a ColBERT retrieval model from the options in ``args``.

    Handles (optionally distributed) seeding, batching, checkpoint loading /
    resuming, mixed-precision training, per-rank metric logging, and periodic
    checkpointing. ``args`` is the parsed CLI namespace; ``DEVICE``, ``Run``,
    ``print_message``, ``print_progress`` and ``manage_checkpoints`` come from
    the surrounding module.
    """
    # Fixed seeds for reproducibility across all RNG sources used here.
    random.seed(12345)
    np.random.seed(12345)
    torch.manual_seed(12345)
    if args.distributed:
        torch.cuda.manual_seed_all(12345)

    if args.distributed:
        # The global batch size must split evenly across ranks; gradient
        # accumulation is not supported in distributed mode.
        assert args.bsize % args.nranks == 0, (args.bsize, args.nranks)
        assert args.accumsteps == 1
        # From here on, args.bsize is the per-process batch size.
        args.bsize = args.bsize // args.nranks

        print("Using args.bsize =", args.bsize, "(per process) and args.accumsteps =", args.accumsteps)

    # Lazy vs eager batch readers share the same (args, rank, nranks) interface;
    # rank -1 (non-distributed) is mapped to reader rank 0.
    if args.lazy:
        reader = LazyBatcher(args, (0 if args.rank == -1 else args.rank), args.nranks)
    else:
        reader = EagerBatcher(args, (0 if args.rank == -1 else args.rank), args.nranks)

    # Barrier pattern: non-zero ranks wait here so that rank 0 downloads the
    # pretrained weights first; they proceed once rank 0 hits its barrier below.
    if args.rank not in [-1, 0]:
        torch.distributed.barrier()

    colbert = ColBERT.from_pretrained('bert-base-uncased',
                                      query_maxlen=args.query_maxlen,
                                      doc_maxlen=args.doc_maxlen,
                                      dim=args.dim,
                                      similarity_metric=args.similarity,
                                      mask_punctuation=args.mask_punctuation)

    if args.checkpoint is not None:
        assert args.resume_optimizer is False, "TODO: This would mean reload optimizer too."
        print_message(f"#> Starting from checkpoint {args.checkpoint} -- but NOT the optimizer!")

        # Load on CPU first; the model is moved to DEVICE below.
        checkpoint = torch.load(args.checkpoint, map_location='cpu')

        # NOTE(review): bare except — any failure (not just key mismatches)
        # silently falls back to strict=False loading; consider narrowing.
        try:
            colbert.load_state_dict(checkpoint['model_state_dict'])
        except:
            print_message("[WARNING] Loading checkpoint with strict=False")
            colbert.load_state_dict(checkpoint['model_state_dict'], strict=False)

    # Release the other ranks now that rank 0 has the weights cached.
    if args.rank == 0:
        torch.distributed.barrier()

    colbert = colbert.to(DEVICE)
    colbert.train()

    if args.distributed:
        colbert = torch.nn.parallel.DistributedDataParallel(colbert,
                                                            device_ids=[args.rank],
                                                            output_device=args.rank,
                                                            find_unused_parameters=True)

    # Only optimize trainable parameters (some may be frozen upstream).
    optimizer = AdamW(filter(lambda p: p.requires_grad, colbert.parameters()),
                      lr=args.lr, eps=1e-8)
    optimizer.zero_grad()

    # Project helper wrapping autocast/GradScaler-style mixed precision.
    amp = MixedPrecisionManager(args.amp)
    criterion = nn.CrossEntropyLoss()
    # Scores are arranged (positive, negative) per example, so the correct
    # class index is always 0 — hence a constant zero label vector.
    labels = torch.zeros(args.bsize, dtype=torch.long, device=DEVICE)

    start_time = time.time()
    train_loss = 0.0

    start_batch_idx = 0

    if args.resume:
        # Resuming requires a checkpoint; fast-forward the reader to the saved
        # batch position using the batch size the checkpoint was trained with.
        assert args.checkpoint is not None
        start_batch_idx = checkpoint['batch']

        reader.skip_to_batch(start_batch_idx, checkpoint['arguments']['bsize'])

    # zip() with the range caps training at args.maxsteps batches.
    for batch_idx, BatchSteps in zip(range(start_batch_idx, args.maxsteps), reader):
        this_batch_loss = 0.0

        # Each BatchSteps is a sequence of micro-batches (gradient accumulation).
        for queries, passages in BatchSteps:
            with amp.context():
                # Reshape pairwise scores to (bsize, 2): column 0 = positive,
                # column 1 = negative passage score.
                scores = colbert(queries, passages).view(2, -1).permute(1, 0)
                loss = criterion(scores, labels[:scores.size(0)])
                loss = loss / args.accumsteps

            # Progress printing only on the main process (rank -1 or 0).
            if args.rank < 1:
                print_progress(scores)

            amp.backward(loss)

            train_loss += loss.item()
            this_batch_loss += loss.item()

        # One optimizer step per outer batch (after all accumulation steps).
        amp.step(colbert, optimizer)

        if args.rank < 1:
            avg_loss = train_loss / (batch_idx + 1)

            num_examples_seen = (batch_idx - start_batch_idx) * args.bsize * args.nranks
            elapsed = float(time.time() - start_time)

            # Throttle mlflow logging to every 20th batch.
            log_to_mlflow = (batch_idx % 20 == 0)
            Run.log_metric('train/avg_loss', avg_loss, step=batch_idx, log_to_mlflow=log_to_mlflow)
            Run.log_metric('train/batch_loss', this_batch_loss, step=batch_idx, log_to_mlflow=log_to_mlflow)
            Run.log_metric('train/examples', num_examples_seen, step=batch_idx, log_to_mlflow=log_to_mlflow)
            Run.log_metric('train/throughput', num_examples_seen / elapsed, step=batch_idx, log_to_mlflow=log_to_mlflow)

            print_message(batch_idx, avg_loss)
            # Periodic checkpoint saving is delegated to the project helper.
            manage_checkpoints(args, colbert, optimizer, batch_idx + 1)