def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Evaluate ``model`` by clustering passage embeddings against gold cluster labels.

    For each evaluation example: embed its non-empty passages, build a pairwise
    euclidean distance matrix, cluster agglomeratively using the gold number of
    clusters, and score the predicted clustering with adjusted Rand, NMI and AMI.

    :param model: sentence-embedding model exposing ``tokenize`` and ``__call__``.
    :param output_path: unused here (kept for evaluator-interface compatibility).
    :param epoch: unused here (kept for evaluator-interface compatibility).
    :param steps: unused here (kept for evaluator-interface compatibility).
    :return: mean adjusted Rand index across all evaluation examples.
    """
    rand_scores, nmi_scores, ami_scores = [], [], []
    model_device = model.device
    # Optionally evaluate on CPU; the model is moved back at the end.
    if not self.use_model_device:
        model.cpu()
    for i in trange(len(self.passages), desc="Evaluating on val", smoothing=0.05):
        # Keep only non-empty passages and truncate labels to match.
        # NOTE(review): this assumes empty passages occur only at the tail of
        # self.passages[i], otherwise labels misalign — TODO confirm upstream.
        passages_to_cluster = [self.passages[i][p] for p in range(len(self.passages[i])) if len(self.passages[i][p])>0]
        true_label = self.labels[i][:len(passages_to_cluster)]
        doc_features = model.tokenize(passages_to_cluster)
        if self.use_model_device:
            batch_to_device(doc_features, model_device)
        doc_embeddings = model(doc_features)['sentence_embedding']
        embeddings_dist_mat = self.euclid_dist(doc_embeddings)
        # Cluster on the precomputed distance matrix with the gold cluster count.
        cl = AgglomerativeClustering(n_clusters=torch.unique(true_label).numel(), affinity='precomputed', linkage='average')
        cluster_label = cl.fit_predict(embeddings_dist_mat.detach().cpu().numpy())
        rand_scores.append(adjusted_rand_score(true_label.numpy(), cluster_label))
        nmi_scores.append(normalized_mutual_info_score(true_label.numpy(), cluster_label))
        ami_scores.append(adjusted_mutual_info_score(true_label.numpy(), cluster_label))
    mean_rand = np.mean(np.array(rand_scores))
    mean_nmi = np.mean(np.array(nmi_scores))
    mean_ami = np.mean(np.array(ami_scores))
    print("\nRAND: %.5f, NMI: %.5f, AMI: %.5f\n" % (mean_rand, mean_nmi, mean_ami), flush=True)
    # Restore the model to its original device if we evaluated on CPU.
    if not self.use_model_device:
        model.to(model_device)
    return mean_rand
def evaluate_language_pair(model, pair_name="cmn-eng", batch_size=32):
    """Bidirectional Tatoeba retrieval accuracy for one language pair.

    Embeds both sides of the pair, indexes each side with faiss (L2 on
    L2-normalized vectors, i.e. cosine ranking), and counts how often a
    sentence's nearest neighbour in the other language is its translation.

    :return: list of two tuples ``(src_lang, tgt_lang, num_hits, num_sentences)``.
    """
    lang_1, lang_2 = pair_name.split("-")

    def make_loader(lang):
        # One dataloader per side of the pair, with the model's smart batching.
        reader = TatoebaReader(TATOEBA_PATH / f"tatoeba.{pair_name}.{lang}")
        dataset = SentencesDataset(reader.get_examples(), model=model)
        return DataLoader(dataset, shuffle=False, batch_size=batch_size,
                          collate_fn=model.smart_batching_collate)

    loader_1 = make_loader(lang_1)
    loader_2 = make_loader(lang_2)
    model.eval()

    def embed_all(loader):
        chunks = []
        with torch.no_grad():
            for batch in loader:
                features = batch_to_device(batch, "cuda")[0][0]
                chunks.append(model(features)['sentence_embedding'])
        return torch.cat(chunks).cpu().numpy()

    emb_1 = embed_all(loader_1)
    emb_2 = embed_all(loader_2)

    def build_index(embeddings):
        index = faiss.IndexFlatL2(embeddings.shape[1])
        faiss.normalize_L2(embeddings)  # in-place normalization before adding
        index.add(embeddings)
        return index

    idx_1 = build_index(emb_1)
    idx_2 = build_index(emb_2)

    def accuracy_row(src_lang, tgt_lang, src_emb, tgt_index):
        # Aligned corpora: sentence i's translation sits at position i.
        _, match = tgt_index.search(x=src_emb, k=1)
        hits = np.sum(match[:, 0] == np.arange(len(src_emb)))
        return (src_lang, tgt_lang, hits, len(src_emb))

    return [
        accuracy_row(lang_1, lang_2, emb_1, idx_2),
        accuracy_row(lang_2, lang_1, emb_2, idx_1),
    ]
def _eval_loss(self, evaluation_loss, eval_dataloader, tb_writer, global_step):
    """Compute the mean per-batch evaluation loss and log it to TensorBoard.

    :param evaluation_loss: loss module called as ``evaluation_loss(features, labels)``.
    :param eval_dataloader: dataloader over the held-out set; its collate_fn is
        replaced with this model's smart batching collate.
    :param tb_writer: TensorBoard ``SummaryWriter``.
    :param global_step: x-axis value for the logged scalar.
    """
    eval_dataloader.collate_fn = self.smart_batching_collate
    loss_value = 0
    num_batches = 0
    with torch.no_grad():
        for batch_cur in eval_dataloader:
            features, labels = batch_to_device(batch_cur, self.device)
            loss_value += evaluation_loss(features, labels)
            num_batches += 1
    # BUG FIX: previously divided by `idx` (the last 0-based index), which
    # over-counted the mean by one batch, raised ZeroDivisionError for a
    # single-batch loader, and NameError for an empty one.
    if num_batches == 0:
        return  # nothing to evaluate
    loss_value_per_batch = float(loss_value / num_batches)
    tb_writer.add_scalar("eval/loss", loss_value_per_batch, global_step)
def do_test(pt_file, model_name, n):
    """Smoke-test the passage embedding pipeline on the first ``n`` lines of a file.

    Reads tab-separated lines from ``pt_file`` (second column is the passage),
    builds a Transformer -> mean-pooling -> 256-d Dense embedding model,
    embeds each passage individually and prints the first few embeddings.
    """
    passages = []
    with open(pt_file, 'r', encoding='utf8') as handle:
        for line in handle:
            passages.append(line.split('\t')[1])
            if len(passages) >= n:
                break
    word_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    pooling = models.Pooling(
        word_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    dense = models.Dense(
        in_features=pooling.get_sentence_embedding_dimension(),
        out_features=256,
        activation_function=nn.Tanh())
    psg_model = CustomSentenceTransformer(modules=[word_model, pooling, dense])
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        psg_model.to(torch.device('cuda'))
    print('Tokenizing')
    tokenized = []
    for passage in passages:
        features = psg_model.tokenize(passage)
        if use_cuda:
            batch_to_device(features, torch.device('cuda'))
        tokenized.append(features)
    print('Embedding')
    embeddings = []
    for features in tokenized:
        emb = psg_model(features)['sentence_embedding']
        # NOTE(review): `.to()` returns a copy that is discarded here (same as
        # the original code) — the stored tensor stays on its current device.
        emb.to(torch.device('cpu'))
        embeddings.append(emb)
    print(embeddings[:10])
def query_batch_collate_fn(self, batch):
    """Collate a batch of examples into query features, per-column passage features and labels.

    Each example carries one query (``q_context``), a fixed number of texts and
    a label. Texts are transposed into columns (one list per text position) and
    each column is tokenized separately with the passage model.

    :return: ``(q_tokenized, psg_features, labels)`` — all moved to ``self.device``.
    """
    num_columns = len(batch[0].texts)
    grouped_texts = [[] for _ in range(num_columns)]
    queries, raw_labels = [], []
    for example in batch:
        queries.append(example.q_context)
        for col, text in enumerate(example.texts):
            grouped_texts[col].append(text)
        raw_labels.append(example.label)
    labels = torch.tensor(raw_labels).to(self.device)
    q_tokenized = self.query_model.tokenize(queries)
    batch_to_device(q_tokenized, self.device)
    psg_features = []
    for column in grouped_texts:
        tokenized = self.psg_model.tokenize(column)
        batch_to_device(tokenized, self.device)
        psg_features.append(tokenized)
    return q_tokenized, psg_features, labels
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Evaluate label-classification accuracy of ``self.softmax_model`` on ``self.dataloader``.

    :param model: sentence model whose ``smart_batching_collate`` is used for batching.
    :param output_path: if given, accuracy is appended to ``self.csv_file`` there.
    :param epoch: current epoch (-1 when called outside training).
    :param steps: current step within the epoch (-1 at epoch end).
    :return: accuracy in [0, 1].
    """
    model.eval()
    total = 0
    correct = 0
    if epoch != -1:
        if steps == -1:
            out_txt = " after epoch {}:".format(epoch)
        else:
            out_txt = " in epoch {} after {} steps:".format(epoch, steps)
    else:
        out_txt = ":"
    logging.info("Evaluation on the " + self.name + " dataset" + out_txt)
    self.dataloader.collate_fn = model.smart_batching_collate
    for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")):
        features, label_ids = batch_to_device(batch, self.device)
        with torch.no_grad():
            _, prediction = self.softmax_model(features, labels=None)
        total += prediction.size(0)
        correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item()
    accuracy = correct / total
    logging.info("Accuracy: {:.4f} ({}/{})\n".format(
        accuracy, correct, total))
    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        file_exists = os.path.isfile(csv_path)
        # Single open: append when the file exists, otherwise create it and
        # write the header first (replaces two near-duplicate branches; matches
        # the CSV-writing pattern used by the MSE evaluator in this file).
        with open(csv_path, mode="a" if file_exists else "w", encoding="utf-8") as f:
            writer = csv.writer(f)
            if not file_exists:
                writer.writerow(self.csv_headers)
            writer.writerow([epoch, steps, accuracy])
    return accuracy
def __call__(self, model, output_path, epoch=-1, steps=-1):
    """Mean-squared-error evaluation of sentence embeddings against target vectors.

    Embeds the first text of every batch, compares against the label vectors,
    logs MSE*100 and optionally appends the result to a CSV file.

    :return: negative MSE*100 (SentenceTransformers maximizes evaluator scores).
    """
    model.eval()
    self.dataloader.collate_fn = model.smart_batching_collate
    collected_embeddings = []
    collected_labels = []
    for batch in self.dataloader:
        features, batch_labels = batch_to_device(batch, self.device)
        with torch.no_grad():
            sentence_emb = model(features[0])['sentence_embedding'].to("cpu").numpy()
        collected_labels.extend(batch_labels.to("cpu").numpy())
        collected_embeddings.extend(sentence_emb)
    embeddings = np.asarray(collected_embeddings)
    labels = np.asarray(collected_labels)
    mse = ((embeddings - labels) ** 2).mean()
    logging.info("MSE evaluation on " + self.name + " dataset")
    mse *= 100
    logging.info("embeddings shape:\t" + str(embeddings.shape))
    logging.info("MSE (*100):\t{:4f}".format(mse))
    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        output_file_exists = os.path.isfile(csv_path)
        with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f:
            writer = csv.writer(f)
            if not output_file_exists:
                writer.writerow(self.csv_headers)
            writer.writerow([epoch, steps, mse])
    return -mse  # Return negative score as SentenceTransformers maximizes the performance
def fit(self,
        train_objectives: Iterable[Tuple[DataLoader, nn.Module]],
        evaluator: SentenceEvaluator = None,
        epochs: int = 1,
        steps_per_epoch=None,
        scheduler: str = 'WarmupLinear',
        warmup_steps: int = 10000,
        optimizer_class: Type[Optimizer] = transformers.AdamW,
        optimizer_params: Dict[str, object] = {
            'lr': 2e-5,
            'eps': 1e-6,
            'correct_bias': False
        },
        weight_decay: float = 0.01,
        evaluation_steps: int = 0,
        output_path: str = None,
        save_best_model: bool = True,
        max_grad_norm: float = 1,
        use_amp: bool = False,
        callback: Callable[[float, int, int], None] = None,
        output_path_ignore_not_empty: bool = False):
    """
    Train the model with the given training objective
    Each training objective is sampled in turn for one batch.
    We sample only as many batches from each objective as there are in the
    smallest one to make sure of equal training with each dataset.

    :param train_objectives: Tuples of (DataLoader, LossFunction). Pass more than one for multi-task learning
    :param evaluator: An evaluator (sentence_transformers.evaluation) evaluates the model performance during
        training on held-out dev data. It is used to determine the best model that is saved to disc.
    :param epochs: Number of epochs for training
    :param steps_per_epoch: Number of training steps per epoch. If set to None (default), one epoch is equal
        the DataLoader size from train_objectives.
    :param scheduler: Learning rate scheduler. Available schedulers: constantlr, warmupconstant, warmuplinear,
        warmupcosine, warmupcosinewithhardrestarts
    :param warmup_steps: Behavior depends on the scheduler. For WarmupLinear (default), the learning rate is
        increased from zero up to the maximal learning rate. After these many training steps, the learning
        rate is decreased linearly back to zero.
    :param optimizer_class: Optimizer
    :param optimizer_params: Optimizer parameters
    :param weight_decay: Weight decay for model parameters
    :param evaluation_steps: If > 0, evaluate the model using evaluator after each number of training steps
    :param output_path: Storage path for the model and evaluation files
    :param save_best_model: If true, the best model (according to evaluator) is stored at output_path
    :param max_grad_norm: Used for gradient normalization.
    :param use_amp: Use Automatic Mixed Precision (AMP). Only for Pytorch >= 1.6.0
    :param callback: Callback function that is invoked after each evaluation.
        It must accept the following three parameters in this order: `score`, `epoch`, `steps`
    :param output_path_ignore_not_empty: deprecated, no longer used
    """
    if use_amp:
        from torch.cuda.amp import autocast
        scaler = torch.cuda.amp.GradScaler()
    self.to(self._target_device)
    if output_path is not None:
        os.makedirs(output_path, exist_ok=True)
    dataloaders = [dataloader for dataloader, _ in train_objectives]
    # Use smart batching
    for dataloader in dataloaders:
        dataloader.collate_fn = self.smart_batching_collate
    loss_models = [loss for _, loss in train_objectives]
    for loss_model in loss_models:
        loss_model.to(self._target_device)
    self.best_score = -9999999
    if steps_per_epoch is None or steps_per_epoch == 0:
        # Equal sampling: one epoch spans the smallest dataloader.
        steps_per_epoch = min(
            [len(dataloader) for dataloader in dataloaders])
    num_train_steps = int(steps_per_epoch * epochs)
    # Prepare optimizers — one optimizer/scheduler pair per training objective.
    optimizers = []
    schedulers = []
    for loss_model in loss_models:
        param_optimizer = list(loss_model.named_parameters())
        # Bias and LayerNorm parameters are excluded from weight decay.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': weight_decay
        }, {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]
        optimizer = optimizer_class(optimizer_grouped_parameters,
                                    **optimizer_params)
        scheduler_obj = self._get_scheduler(optimizer,
                                            scheduler=scheduler,
                                            warmup_steps=warmup_steps,
                                            t_total=num_train_steps)
        optimizers.append(optimizer)
        schedulers.append(scheduler_obj)
    global_step = 0
    data_iterators = [iter(dataloader) for dataloader in dataloaders]
    num_train_objectives = len(train_objectives)
    skip_scheduler = False
    for epoch in trange(epochs, desc="Epoch"):
        training_steps = 0
        for loss_model in loss_models:
            loss_model.zero_grad()
            loss_model.train()
        for _ in trange(steps_per_epoch, desc="Iteration", smoothing=0.05):
            # Round-robin: one batch per objective per iteration.
            for train_idx in range(num_train_objectives):
                loss_model = loss_models[train_idx]
                optimizer = optimizers[train_idx]
                scheduler = schedulers[train_idx]
                data_iterator = data_iterators[train_idx]
                try:
                    data = next(data_iterator)
                except StopIteration:
                    # Exhausted objectives restart from the beginning.
                    #logging.info("Restart data_iterator")
                    data_iterator = iter(dataloaders[train_idx])
                    data_iterators[train_idx] = data_iterator
                    data = next(data_iterator)
                features, labels = batch_to_device(data, self._target_device)
                if use_amp:
                    with autocast():
                        loss_value = loss_model(features, labels)
                    scale_before_step = scaler.get_scale()
                    scaler.scale(loss_value).backward()
                    scaler.unscale_(optimizer)
                    torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                                   max_grad_norm)
                    scaler.step(optimizer)
                    scaler.update()
                    # If the loss scale changed, the optimizer step was skipped
                    # by GradScaler, so skip the scheduler step too.
                    skip_scheduler = scaler.get_scale() != scale_before_step
                else:
                    loss_value = loss_model(features, labels)
                    loss_value.backward()
                    torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                                   max_grad_norm)
                    optimizer.step()
                optimizer.zero_grad()
                if not skip_scheduler:
                    scheduler.step()
            training_steps += 1
            global_step += 1
            if evaluation_steps > 0 and training_steps % evaluation_steps == 0:
                self._eval_during_training(evaluator, output_path,
                                           save_best_model, epoch,
                                           training_steps, callback)
                # Evaluation may toggle eval mode; restore training state.
                for loss_model in loss_models:
                    loss_model.zero_grad()
                    loss_model.train()
        # End-of-epoch evaluation (steps = -1 marks "after full epoch").
        self._eval_during_training(evaluator, output_path, save_best_model,
                                   epoch, -1, callback)
def train(args, train_dataset, model, train_loss, dev_dataset=None):
    """Distributed/fp16-capable training loop with periodic dev evaluation.

    :param args: namespace with training configuration (batch sizes, ranks,
        device, fp16 flags, evaluation_steps, output_dir, ...).
    :param train_dataset: dataset for training.
    :param model: sentence model providing smart batching, scheduler factory and save().
    :param train_loss: loss module called as ``train_loss(features, labels)``.
    :param dev_dataset: optional dev set for the similarity evaluator.
    :return: mean training loss per optimization step.
    """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  shuffle=False)
    dev_dataloader = DataLoader(dev_dataset,
                                shuffle=False,
                                batch_size=args.train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
    train_objectives = [(train_dataloader, train_loss)]
    epochs = args.epochs
    # evaluation_steps = 1000
    output_path = args.output_dir
    optimizer_class = transformers.AdamW
    optimizer_params = {
        'lr': args.learning_rate,
        'eps': 1e-6,
        'correct_bias': False
    }
    max_grad_norm = 1
    # local_rank = -1
    save_epoch = True
    dataloaders = [dataloader for dataloader, _ in train_objectives]
    # Use smart batching
    for dataloader in dataloaders:
        dataloader.collate_fn = model.smart_batching_collate
    loss_models = [loss for _, loss in train_objectives]
    logging.info('number of models is {} '.format(len(loss_models)))
    for loss_model in loss_models:
        loss_model.to(args.device)
    model.best_score = -9999
    min_batch_size = min([len(dataloader) for dataloader in dataloaders])
    num_train_steps = int(min_batch_size * epochs)
    warmup_steps = math.ceil(
        len(train_dataset) * args.epochs / args.train_batch_size *
        0.1)  # 10% of train data for warm-up
    # Prepare optimizers — one optimizer/scheduler pair per objective;
    # bias and LayerNorm parameters are excluded from weight decay.
    optimizers = []
    schedulers = []
    for loss_model in loss_models:
        param_optimizer = list(loss_model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        t_total = num_train_steps
        if args.local_rank != -1:
            t_total = t_total // args.world_size
        optimizer = optimizer_class(optimizer_grouped_parameters,
                                    **optimizer_params)
        scheduler = model._get_scheduler(optimizer,
                                         scheduler='WarmupLinear',
                                         warmup_steps=warmup_steps,
                                         t_total=t_total)
        optimizers.append(optimizer)
        schedulers.append(scheduler)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        for idx in range(len(loss_models)):
            model2, optimizer2 = amp.initialize(loss_models[idx],
                                                optimizers[idx],
                                                opt_level=args.fp16_opt_level)
            loss_models[idx] = model2
            optimizers[idx] = optimizer2
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        for idx, loss_model in enumerate(loss_models):
            loss_models[idx] = torch.nn.parallel.DistributedDataParallel(
                loss_model,
                device_ids=[args.local_rank],
                output_device=args.local_rank,
                find_unused_parameters=True)
        logger.info('Setting Dist Paralel rank:{}'.format(args.local_rank))
    elif args.n_gpu > 1:
        for idx, loss_model in enumerate(loss_models):
            loss_models[idx] = torch.nn.parallel.DataParallel(loss_model)
    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.epochs)
    logger.info(" Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Total optimization steps = %d", t_total)
    global_step = 0
    data_iterators = [iter(dataloader) for dataloader in dataloaders]
    num_train_objectives = len(train_objectives)
    # set_seeds(1,args)
    tr_loss = 0.0
    for epoch in trange(epochs,
                        desc="Epoch",
                        disable=args.local_rank not in [-1, 0]):
        training_steps = 0
        for loss_model in loss_models:
            loss_model.zero_grad()
            loss_model.train()
        for step in trange(num_train_objectives * min_batch_size,
                           desc="Iteration",
                           disable=args.local_rank not in [-1, 0]):
            idx = step % num_train_objectives
            loss_model = loss_models[idx]
            optimizer = optimizers[idx]
            scheduler = schedulers[idx]
            data_iterator = data_iterators[idx]
            try:
                data = next(data_iterator)
            except StopIteration:
                logging.info("Restart data_iterator")
                data_iterator = iter(dataloaders[idx])
                data_iterators[idx] = data_iterator
                data = next(data_iterator)
            features, labels = batch_to_device(data, args.device)
            loss_value = loss_model(features, labels)
            if args.n_gpu > 1:
                loss_value = loss_value.mean()  # DataParallel returns per-GPU losses
            if args.fp16:
                with amp.scale_loss(loss_value, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               max_grad_norm)
            else:
                loss_value.backward()
                torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                               max_grad_norm)
            training_steps += 1
            tr_loss += loss_value.item()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1
            if args.evaluation_steps > 0 and training_steps % args.evaluation_steps == 0 and evaluator is not None:
                score = evaluator(model)
                for loss_model in loss_models:
                    loss_model.zero_grad()
                    loss_model.train()
                # BUG FIX: logging.info was called print-style with extra
                # positional args ("msg:", score[0], " at step ", step), which
                # logging treats as %-format arguments and fails to format.
                # Use lazy %-style placeholders instead.
                logging.info("Dev cosine-Similarity MSE: %s at step %d",
                             score[0], global_step)
                logging.info("Dev cosine-Similarity MAE: %s at step %d",
                             score[1], global_step)
                if args.local_rank in [-1, 0]:
                    model.save(output_path + "_step_" + str(global_step))
        if args.local_rank in [-1, 0] and save_epoch:
            model.save(output_path + "_ep_" + str(epoch))
    return tr_loss / global_step
def fit(self,
        args,
        train_objectives: Iterable[Tuple[DataLoader, nn.Module]],
        eval_objectives: Iterable[Tuple[DataLoader, nn.Module]],
        train_evaluator: SentenceEvaluator,
        evaluator: SentenceEvaluator,
        train_phase: str = 'STS',
        epochs: int = 1,
        steps_per_epoch=None,
        scheduler_name: str = 'WarmupLinear',
        warmup_steps: int = 10000,
        optimizer_class: Type[Optimizer] = transformers.AdamW,
        optimizer_params: Dict[str, object] = {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False},
        weight_decay: float = 0.01,
        evaluation_steps: int = 0,
        save_best_model: bool = True,
        max_grad_norm: float = 1,
        fp16: bool = False,
        fp16_opt_level: str = 'O1',
        local_rank: int = -1
        ):
    """Train on the first training objective with TensorBoard logging and checkpoint/resume support.

    :param args: namespace with at least ``output_dir``, ``should_continue``,
        ``global_step`` (written back) — other fields are passed through to helpers.
    :param train_objectives: (DataLoader, loss) pairs; only the LAST loss's
        parameters are optimized and only dataloaders[0] is iterated (see notes below).
    :param eval_objectives: (DataLoader, loss) pairs; only the first is used.
    :param train_evaluator: evaluator run on training data at each evaluation step.
    :param evaluator: held-out evaluator used for best-checkpoint selection.
    :param train_phase: 'STS' or 'NLI'; selects which TensorBoard scalar name is used.
    :param fp16 / fp16_opt_level: apex mixed-precision configuration.
    :param local_rank: distributed rank; -1 means single-process.
    """
    if train_phase not in ['STS', 'NLI']:
        assert False, print(f"Not valid train_phase given.")
    self.lr = optimizer_params['lr']
    self.desc_string = f'lr-{self.lr}_epochs-{epochs}_warmup_steps-{warmup_steps}'
    logger.info(f"model description is {self.desc_string}.")
    args.desc_string = self.desc_string
    if args.output_dir is not None:
        # empty folder is not necessary.
        os.makedirs(args.output_dir, exist_ok=True)
    path_prefix = args.output_dir.split('/')[0]
    tb_writer = SummaryWriter(log_dir=os.path.join(path_prefix, self.desc_string))
    tb_writer.add_text('experiment args', self.desc_string, 0)
    dataloaders = [dataloader for dataloader, _ in train_objectives]
    # Use smart batching
    for dataloader in dataloaders:
        dataloader.collate_fn = self.smart_batching_collate
    ## GX: this design is for the composite loss.
    loss_models = [loss for _, loss in train_objectives]
    eval_dataloader, evaluation_loss = eval_objectives[0]  # the current version
    device = self.device
    for loss_model in loss_models:
        loss_model.to(device)
    self.best_score = -9999999
    if steps_per_epoch is None or steps_per_epoch == 0:
        steps_per_epoch = min([len(dataloader) for dataloader in dataloaders])
    num_train_steps = int(steps_per_epoch * epochs)
    # Prepare optimizer and schedule (linear warmup and decay)
    # NOTE(review): `loss_model` here is the last element of the loop above —
    # only ONE optimizer is built, over that model's parameters. With a single
    # training objective this is fine; with several it silently ignores the
    # rest — TODO confirm intended for the composite-loss design.
    param_optimizer = list(loss_model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    t_total = num_train_steps
    if local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params)
    scheduler = self._get_scheduler(optimizer,
                                    scheduler=scheduler_name,
                                    warmup_steps=warmup_steps,
                                    t_total=t_total)
    # Config
    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    data_iterators = [iter(dataloader) for dataloader in dataloaders]
    # Check if continuing training from a checkpoint
    if args.should_continue:
        if self.loading_model_dir and os.path.exists(self.loading_model_dir):
            # Restore optimizer/scheduler state, then fast-forward the
            # epoch/step counters from the checkpoint.
            optimizer.load_state_dict(torch.load(os.path.join(self.loading_model_dir, "optimizer.pt")))
            scheduler.load_state_dict(torch.load(os.path.join(self.loading_model_dir, "scheduler.pt")))
            global_step, epochs_trained, steps_trained_in_current_epoch = self.load(self.loading_model_dir, steps_per_epoch)
    else:
        logger.info("  Starting fine-tuning.")
    # Train !
    for epoch in trange(epochs_trained, epochs, desc="Epoch"):
        training_steps = 0  # training steps per epoch.
        loss_model.zero_grad()
        loss_model.train()
        data_iterator = data_iterators[0]
        for _ in trange(steps_per_epoch, desc="Iteration", smoothing=0.05):
            try:
                data = next(data_iterator)
            except StopIteration:
                data_iterator = iter(dataloaders[0])
                data_iterators[0] = data_iterator
                data = next(data_iterator)
            # When resuming mid-epoch, consume (skip) already-trained batches.
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                training_steps += 1
                continue
            features, labels = batch_to_device(data, self.device)
            loss_value = loss_model(features, labels)
            tb_writer.add_scalar("progress/lr", scheduler.get_lr()[0], global_step)
            tb_writer.add_scalar("progress/steps_per_epoch", steps_per_epoch, global_step)
            tb_writer.add_scalar("progress/num_train_steps", num_train_steps, global_step)
            tb_writer.add_scalar("train/loss_value", loss_value, global_step)
            if fp16:
                with amp.scale_loss(loss_value, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
            else:
                loss_value.backward()
                torch.nn.utils.clip_grad_norm_(loss_model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            training_steps += 1
            global_step += 1
            args.global_step = global_step
            if evaluation_steps > 0 and training_steps % evaluation_steps == 0:
                # Periodic: train-side metric, checkpoint rotation, eval loss,
                # and best-checkpoint selection on the held-out evaluator.
                criterion = train_evaluator(self, output_path=args.output_dir, epoch=epoch, steps=global_step)
                if train_phase == 'NLI':
                    tb_writer.add_scalar("train/accuracy", criterion, global_step)
                elif train_phase == 'STS':
                    tb_writer.add_scalar("train/score", criterion, global_step)
                current_file_name = 'checkpoint-'+str(global_step)
                rotate_checkpoints(args, checkpoint_prefix="checkpoint")
                self.save(current_file_name, args, optimizer, scheduler)
                self._eval_loss(evaluation_loss, eval_dataloader, tb_writer, global_step)
                if self._eval_during_training_custom(evaluator, args.output_dir, tb_writer, epoch, training_steps, global_step):
                    self._save_the_best_checkpoint(args, global_step, optimizer, scheduler)
                loss_model.zero_grad()
                loss_model.train()
        # End-of-epoch evaluation (steps = -1 marks "after full epoch").
        self._eval_loss(evaluation_loss, eval_dataloader, tb_writer, global_step)
        if self._eval_during_training_custom(evaluator, args.output_dir, tb_writer, epoch, -1, global_step):
            self._save_the_best_checkpoint(args, global_step, optimizer, scheduler)
    ## write results into pkl files.
    write_result(args, self.best_score)
def train(args, train_dataset, model, train_loss):
    """TPU (torch_xla) training loop variant.

    Uses a project ``SampleGenerator`` instead of a sampler-backed DataLoader
    and steps the optimizer via ``xm.optimizer_step`` so XLA executes and
    synchronizes the graph. No periodic evaluation; saves once per epoch.

    :return: mean training loss per optimization step.
    """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    #train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = SampleGenerator(train_dataset,
                                       sample_count=args.train_batch_size)
    train_objectives = [(train_dataloader, train_loss)]
    epochs = args.epochs
    output_path = args.output_dir
    optimizer_class = transformers.AdamW
    optimizer_params = {
        'lr': args.learning_rate,
        'eps': 1e-6,
        'correct_bias': False
    }
    max_grad_norm = 1
    # local_rank = -1
    save_epoch = True
    dataloaders = [dataloader for dataloader, _ in train_objectives]
    # Use smart batching
    for dataloader in dataloaders:
        dataloader.collate_fn = model.smart_batching_collate
    loss_models = [loss for _, loss in train_objectives]
    logging.info('number of models is {} '.format(len(loss_models)))
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for loss_model in loss_models:
        loss_model.to(args.device)
    model.best_score = -9999
    min_batch_size = min([len(dataloader) for dataloader in dataloaders])
    num_train_steps = int(min_batch_size * epochs)
    warmup_steps = math.ceil(
        len(train_dataset) * args.epochs / args.train_batch_size *
        0.1)  # 10% of train data for warm-up
    # Prepare optimizers — one optimizer/scheduler pair per objective;
    # bias and LayerNorm parameters are excluded from weight decay.
    optimizers = []
    schedulers = []
    for loss_model in loss_models:
        param_optimizer = list(loss_model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        t_total = num_train_steps
        if args.local_rank != -1:
            t_total = t_total // args.world_size
        optimizer = optimizer_class(optimizer_grouped_parameters,
                                    **optimizer_params)
        scheduler = model._get_scheduler(optimizer,
                                         scheduler='WarmupLinear',
                                         warmup_steps=warmup_steps,
                                         t_total=t_total)
        optimizers.append(optimizer)
        schedulers.append(scheduler)
    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", epochs)
    logger.info(" Instantaneous batch size per TPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        " Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info(" Total optimization steps = %d", t_total)
    global_step = 0
    data_iterators = [iter(dataloader) for dataloader in dataloaders]
    num_train_objectives = len(train_objectives)
    # set_seeds(1,args)
    tr_loss = 0.0
    for epoch in trange(epochs,
                        desc="Epoch",
                        disable=args.local_rank not in [-1, 0]):
        training_steps = 0
        for loss_model in loss_models:
            loss_model.zero_grad()
            loss_model.train()
        # Round-robin over objectives, one batch per step.
        for step in trange(num_train_objectives * min_batch_size,
                           desc="Iteration",
                           disable=args.local_rank not in [-1, 0]):
            idx = step % num_train_objectives
            loss_model = loss_models[idx]
            optimizer = optimizers[idx]
            scheduler = schedulers[idx]
            data_iterator = data_iterators[idx]
            try:
                data = next(data_iterator)
            except StopIteration:
                # Exhausted objectives restart from the beginning.
                logging.info("Restart data_iterator")
                data_iterator = iter(dataloaders[idx])
                data_iterators[idx] = data_iterator
                data = next(data_iterator)
            features, labels = batch_to_device(data, args.device)
            loss_value = loss_model(features, labels)
            loss_value.backward()
            torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                           max_grad_norm)
            training_steps += 1
            tr_loss += loss_value.item()
            #optimizer.step()
            # XLA-aware optimizer step; barrier=True forces graph execution now.
            xm.optimizer_step(optimizer, barrier=True)
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1
        if args.local_rank in [-1, 0] and save_epoch:
            model.save(output_path + "_" + str(epoch))
    return tr_loss / global_step
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Evaluate classification accuracy and optionally export per-example predictions.

    When ``self.label_text`` is set, each example's predicted label is inserted
    into its row and the annotated rows are written to ``self.csv_label_text``.

    :param model: sentence model whose ``smart_batching_collate`` is used for batching.
    :param output_path: if given, accuracy (and predictions) are written there.
    :param epoch: current epoch (-1 when called outside training).
    :param steps: current step within the epoch (-1 at epoch end).
    :return: accuracy in [0, 1].
    """
    model.eval()
    total = 0
    correct = 0
    if epoch != -1:
        if steps == -1:
            out_txt = " after epoch {}:".format(epoch)
        else:
            out_txt = " in epoch {} after {} steps:".format(epoch, steps)
    else:
        out_txt = ":"
    logging.info("Evaluation on the " + self.name + " dataset" + out_txt)
    self.dataloader.collate_fn = model.smart_batching_collate
    pre_results = torch.tensor([], dtype=torch.int64).to(self.device)
    for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")):
        features, label_ids = batch_to_device(batch, self.device)
        with torch.no_grad():
            _, prediction = self.softmax_model(features, labels=None)
        total += prediction.size(0)
        pre = torch.argmax(prediction, dim=1)
        correct += pre.eq(label_ids).sum().item()
        # BUG FIX: prediction accumulation was commented out, so the
        # label_text export below always indexed an empty tensor (IndexError).
        if self.label_text:
            pre_results = torch.cat((pre_results, pre), 0)
    accuracy = correct / total
    logging.info("Accuracy: {:.4f} ({}/{})\n".format(
        accuracy, correct, total))
    print("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total))
    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        file_exists = os.path.isfile(csv_path)
        # Append if present; otherwise create with a header row (replaces two
        # near-duplicate write branches).
        with open(csv_path, mode="a" if file_exists else "w", encoding="utf-8") as f:
            writer = csv.writer(f)
            if not file_exists:
                writer.writerow(self.csv_headers)
            writer.writerow([epoch, steps, accuracy])
    if self.label_text:
        pre_results = pre_results.cpu().numpy().tolist()
        # Insert each example's predicted label as column 1 of its row.
        for i in range(len(self.label_text)):
            self.label_text[i].insert(1, pre_results[i])
        if output_path is not None:
            csv_path = os.path.join(output_path, self.csv_label_text)
            with open(csv_path, mode="w", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(self.label_text_headers)
                for element in self.label_text:
                    writer.writerow(element)
    return accuracy
def fit(self,
        train_objectives: Iterable[Tuple[DataLoader, nn.Module]],
        evaluator: SentenceEvaluator,
        epochs: int = 1,
        eval_dataloader=None,
        steps_per_epoch=None,
        scheduler: str = 'WarmupLinear',
        warmup_steps: int = 10000,
        optimizer_class: Type[Optimizer] = transformers.AdamW,
        optimizer_params: Dict[str, object] = {
            'lr': 2e-5,
            'eps': 1e-6,
            'correct_bias': False
        },
        weight_decay: float = 0.01,
        evaluation_steps: int = 0,
        output_path: str = None,
        save_best_model: bool = True,
        max_grad_norm: float = 1,
        fp16: bool = False,
        fp16_opt_level: str = 'O1',
        local_rank: int = -1):
    """Train the model against one or more (dataloader, loss) objectives.

    Each step draws one batch per objective (round-robin, restarting exhausted
    iterators), backpropagates through that objective's loss model, and steps
    its own optimizer/scheduler. Progress is logged to TensorBoard when
    ``output_path`` is given, and the model is periodically evaluated via
    ``_eval_during_training_custom``.

    :param train_objectives: pairs of (DataLoader, loss module) to optimize jointly.
    :param evaluator: evaluator invoked during/after each epoch.
    :param epochs: number of passes; each pass runs ``steps_per_epoch`` steps.
    :param steps_per_epoch: defaults to the shortest dataloader's length.
    :param scheduler: LR schedule name resolved by ``self._get_scheduler``.
    :param warmup_steps: warmup steps for the schedule.
    :param optimizer_class: optimizer constructor; one instance per objective.
    :param optimizer_params: kwargs for the optimizer (``lr`` is also recorded
        on ``self``). NOTE: dict default is shared across calls but never mutated here.
    :param evaluation_steps: evaluate every N training steps (0 disables).
    :param output_path: checkpoint/TensorBoard directory; may be None.
    :param fp16: use NVIDIA apex mixed precision (requires apex installed).
    :param local_rank: distributed rank; -1 means single-process training.
    """
    self.lr = optimizer_params['lr']
    self.desc_string = f'lr-{self.lr}_epochs-{epochs}_warmup_steps-{warmup_steps}'
    print(f"model description is {self.desc_string}.")

    # Fix: tb_writer was previously referenced unconditionally in the training
    # loop but only created when output_path was given -> NameError when
    # output_path is None. Keep it None and guard every logging call.
    tb_writer = None
    if output_path is not None:  # empty folder is not necessary.
        os.makedirs(output_path, exist_ok=True)
        path_prefix = output_path.split('/')[0]
        tb_writer = SummaryWriter(
            log_dir=os.path.join(path_prefix, self.desc_string))
        tb_writer.add_text('experiment args', self.desc_string, 0)

    dataloaders = [dataloader for dataloader, _ in train_objectives]
    # Use smart batching
    for dataloader in dataloaders:
        dataloader.collate_fn = self.smart_batching_collate

    ## GX: this design is for the composite loss.
    loss_models = [loss for _, loss in train_objectives]
    device = self.device
    for loss_model in loss_models:
        loss_model.to(device)

    self.best_score = -9999999
    if steps_per_epoch is None or steps_per_epoch == 0:
        steps_per_epoch = min([len(dataloader) for dataloader in dataloaders])
    num_train_steps = int(steps_per_epoch * epochs)

    # Prepare one optimizer/scheduler pair per loss model, with weight decay
    # disabled for biases and LayerNorm parameters (standard BERT fine-tuning).
    optimizers = []
    schedulers = []
    for loss_model in loss_models:
        param_optimizer = list(loss_model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': weight_decay
        }, {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]
        t_total = num_train_steps
        if local_rank != -1:
            t_total = t_total // torch.distributed.get_world_size()
        optimizer = optimizer_class(optimizer_grouped_parameters,
                                    **optimizer_params)
        scheduler_obj = self._get_scheduler(optimizer,
                                            scheduler=scheduler,
                                            warmup_steps=warmup_steps,
                                            t_total=t_total)
        optimizers.append(optimizer)
        schedulers.append(scheduler_obj)

    # Decides the data-type here.
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        for train_idx in range(len(loss_models)):
            model, optimizer = amp.initialize(loss_models[train_idx],
                                              optimizers[train_idx],
                                              opt_level=fp16_opt_level)
            loss_models[train_idx] = model
            optimizers[train_idx] = optimizer

    global_step = 0
    ## GX: only use iter, instead of for loop on the dataloader.
    data_iterators = [iter(dataloader) for dataloader in dataloaders]
    num_train_objectives = len(train_objectives)

    for epoch in trange(epochs, desc="Epoch"):
        training_steps = 0
        for loss_model in loss_models:
            loss_model.zero_grad()
            loss_model.train()

        for _ in trange(steps_per_epoch, desc="Iteration", smoothing=0.05):
            for train_idx in range(num_train_objectives):
                loss_model = loss_models[train_idx]
                optimizer = optimizers[train_idx]
                scheduler = schedulers[train_idx]
                data_iterator = data_iterators[train_idx]
                try:
                    data = next(data_iterator)
                except StopIteration:
                    # Restart an exhausted dataloader so shorter objectives
                    # keep contributing batches for the whole epoch.
                    data_iterator = iter(dataloaders[train_idx])
                    data_iterators[train_idx] = data_iterator
                    data = next(data_iterator)

                features, labels = batch_to_device(data, self.device)
                loss_value = loss_model(features, labels)

                if tb_writer is not None:
                    tb_writer.add_scalar("progress/lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("progress/steps_per_epoch",
                                         steps_per_epoch, global_step)
                    tb_writer.add_scalar("progress/num_train_steps",
                                         num_train_steps, global_step)
                    tb_writer.add_scalar("train/loss_value", loss_value,
                                         global_step)

                if fp16:
                    with amp.scale_loss(loss_value, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), max_grad_norm)
                else:
                    loss_value.backward()
                    torch.nn.utils.clip_grad_norm_(loss_model.parameters(),
                                                   max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            training_steps += 1
            global_step += 1

            if evaluation_steps > 0 and training_steps % evaluation_steps == 0:
                # NOTE(review): tb_writer may be None here when output_path is
                # None — confirm _eval_during_training_custom tolerates that.
                self._eval_during_training_custom(evaluator, output_path,
                                                  tb_writer, save_best_model,
                                                  epoch, training_steps,
                                                  global_step)
                for loss_model in loss_models:
                    loss_model.zero_grad()
                    loss_model.train()

        self._eval_during_training_custom(evaluator, output_path, tb_writer,
                                          save_best_model, epoch, -1,
                                          global_step)
def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float:
    """Evaluate ``model`` on a binary sentence-pair classification task.

    Embeds both sentences of every pair, then finds the accuracy-maximizing
    decision threshold for cosine similarity, Manhattan distance and Euclidean
    distance via ``self.find_best_acc_and_threshold``. Results are logged and
    optionally appended to a CSV under ``output_path``.

    :param model: sentence-embedding model providing ``smart_batching_collate``.
    :param output_path: directory for the accuracy CSV; skipped when None.
    :param epoch: current epoch for logging/CSV (-1 for standalone evaluation).
    :param steps: current step for logging/CSV (-1 at epoch end).
    :return: best accuracy for ``self.main_similarity``.
    :raises ValueError: on labels other than 0/1, or unknown ``main_similarity``.
    """
    model.eval()
    embeddings1 = []
    embeddings2 = []
    labels = []

    if epoch != -1:
        if steps == -1:
            out_txt = f" after epoch {epoch}:"
        else:
            out_txt = f" in epoch {epoch} after {steps} steps:"
    else:
        out_txt = ":"
    logging.info("Evaluation the model on " + self.name + " dataset" + out_txt)

    self.dataloader.collate_fn = model.smart_batching_collate
    for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")):
        features, label_ids = batch_to_device(batch, self.device)
        with torch.no_grad():
            # Exactly two sentence feature dicts per pair.
            emb1, emb2 = [
                model(sent_features)['sentence_embedding'].to("cpu").numpy()
                for sent_features in features
            ]
        labels.extend(label_ids.to("cpu").numpy())
        embeddings1.extend(emb1)
        embeddings2.extend(emb2)

    cosine_scores = 1 - paired_cosine_distances(embeddings1, embeddings2)
    manhattan_distances = paired_manhattan_distances(embeddings1, embeddings2)
    euclidean_distances = paired_euclidean_distances(embeddings1, embeddings2)

    # Validate explicitly instead of `assert`, which vanishes under `python -O`.
    labels = np.asarray(labels)
    if not np.isin(labels, (0, 1)).all():
        raise ValueError("Labels must be binary (0 or 1)")

    cosine_acc, cosine_threshold = self.find_best_acc_and_threshold(
        cosine_scores, labels, True)
    manhattan_acc, manhattan_threshold = self.find_best_acc_and_threshold(
        manhattan_distances, labels, False)
    euclidean_acc, euclidean_threshold = self.find_best_acc_and_threshold(
        euclidean_distances, labels, False)

    logging.info(
        "Accuracy with Cosine-Similarity:\t{:.2f}\t(Threshold: {:.4f})".
        format(cosine_acc * 100, cosine_threshold))
    logging.info(
        "Accuracy with Manhattan-Distance:\t{:.2f}\t(Threshold: {:.4f})".
        format(manhattan_acc * 100, manhattan_threshold))
    logging.info(
        "Accuracy with Euclidean-Distance:\t{:.2f}\t(Threshold: {:.4f})\n".
        format(euclidean_acc * 100, euclidean_threshold))

    if output_path is not None:
        csv_path = os.path.join(output_path, self.csv_file)
        write_header = not os.path.isfile(csv_path)
        # newline="" per the csv module docs; write header once, then append.
        with open(csv_path, mode="w" if write_header else "a", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            if write_header:
                writer.writerow(self.csv_headers)
            writer.writerow([epoch, steps, cosine_acc, euclidean_acc, manhattan_acc])

    if self.main_similarity == SimilarityFunction.COSINE:
        return cosine_acc
    elif self.main_similarity == SimilarityFunction.EUCLIDEAN:
        return euclidean_acc
    elif self.main_similarity == SimilarityFunction.MANHATTAN:
        return manhattan_acc
    else:
        raise ValueError("Unknown main_similarity value")