def evaluate_treccar(model_path, test_art_qrels, test_top_qrels, test_hier_qrels, test_paratext, level):
    """Cluster-evaluate a saved SentenceTransformer on TREC-CAR.

    Builds one InputTRECCARExample per page — padded to the size of the
    largest page — for both top-level and hierarchical section labels, then
    runs a ClusterEvaluator on the level selected by ``level``
    ('h' -> hierarchical, anything else -> top-level).
    """
    page_paras, rev_para_top, rev_para_hier = get_trec_dat(
        test_art_qrels, test_top_qrels, test_hier_qrels)
    para_counts = np.array([len(page_paras[pg]) for pg in page_paras.keys()])
    print('test mean paras: %.2f, std: %.2f, max paras: %.2f' %
          (np.mean(para_counts), np.std(para_counts), np.max(para_counts)))
    ptext_dict = get_paratext_dict(test_paratext)
    top_cluster_data = []
    hier_cluster_data = []
    pad_to = max([len(page_paras[pg]) for pg in page_paras.keys()])
    pages = list(page_paras.keys())
    for idx in trange(len(pages)):
        page = pages[idx]
        paras = page_paras[page]
        paratexts = [ptext_dict[p] for p in paras]
        # Map each paragraph's section name to a dense integer cluster id.
        top_secs = list(set([rev_para_top[p] for p in paras]))
        top_labels = [top_secs.index(rev_para_top[p]) for p in paras]
        hier_secs = list(set([rev_para_hier[p] for p in paras]))
        hier_labels = [hier_secs.index(rev_para_hier[p]) for p in paras]
        # Page id looks like 'enwiki:Some%20Title' -> query text 'Some Title'.
        query_text = ' '.join(page.split('enwiki:')[1].split('%20'))
        n = len(paras)
        # Pad every page to the same document count; -1 labels mark padding.
        paras = paras + ['dummy'] * (pad_to - n)
        paratexts = paratexts + [''] * (pad_to - n)
        top_labels = top_labels + [-1] * (pad_to - n)
        hier_labels = hier_labels + [-1] * (pad_to - n)
        top_cluster_data.append(
            InputTRECCARExample(qid=page, q_context=query_text, pids=paras,
                                texts=paratexts, label=np.array(top_labels)))
        hier_cluster_data.append(
            InputTRECCARExample(qid=page, q_context=query_text, pids=paras,
                                texts=paratexts, label=np.array(hier_labels)))
    print("Top-level datasets")
    print("Test instances: %5d" % len(top_cluster_data))
    model = SentenceTransformer(model_path)
    if level == 'h':
        print('Evaluating hiererchical clusters')
        test_evaluator = ClusterEvaluator.from_input_examples(hier_cluster_data)
    else:
        print('Evaluating toplevel clusters')
        test_evaluator = ClusterEvaluator.from_input_examples(top_cluster_data)
    model.evaluate(test_evaluator)
def evaluate_ng20(model_path, test_cluster_data, gpu_eval):
    """Run ClusterEvaluator over `test_cluster_data` with a saved model.

    Moves the model to CUDA when available; `gpu_eval` is forwarded to the
    evaluator factory.
    """
    cuda_ok = torch.cuda.is_available()
    if cuda_ok:
        print('CUDA is available')
    else:
        print('Using CPU')
    device = torch.device('cuda' if cuda_ok else 'cpu')
    model = SentenceTransformer(model_path)
    model.to(device)
    evaluator = ClusterEvaluator.from_input_examples(test_cluster_data, gpu_eval)
    model.evaluate(evaluator)
def main():
    """Score the pretrained 'bert-base-nli-mean-tokens' model on STS test."""
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    reader = STSDataReader('datasets/stsbenchmark')
    examples = reader.get_examples('sts-test.csv')
    test_data = SentencesDataset(examples=examples,
                                 model=model,
                                 dataset_cache_id='sts-eval')
    loader = DataLoader(test_data, shuffle=False, batch_size=16)
    model.evaluate(EmbeddingSimilarityEvaluator(loader))
def pretrained_model_score(self, model_name, expected_score):
    """Assert `model_name` reaches `expected_score` (or within 0.1) on STS test.

    Downloads the STS benchmark tsv.gz on first use.
    """
    model = SentenceTransformer(model_name)
    sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
    if not os.path.exists(sts_dataset_path):
        util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                      sts_dataset_path)
    # Bucket rows by split; only 'test' is actually scored below.
    splits = {'dev': [], 'test': [], 'train': []}
    with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
        for row in csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE):
            score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
            example = InputExample(texts=[row['sentence1'], row['sentence2']],
                                   label=score)
            key = row['split'] if row['split'] in ('dev', 'test') else 'train'
            splits[key].append(example)
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        splits['test'], name='sts-test')
    score = model.evaluate(evaluator) * 100
    print(model_name, "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
    assert score > expected_score or abs(score - expected_score) < 0.1
def train_sbert(model_name, model_save_path):
    """Fine-tune BERT on AllNLI with softmax loss, then score on STS test.

    Trains for one epoch with STS-dev as the in-training evaluator, saves the
    best checkpoint to `model_save_path`, reloads it, and evaluates on the
    STS test split.
    """
    batch_size = 16
    nli_reader, sts_reader = load_dataset()
    num_labels = nli_reader.get_num_labels()
    # BERT token embeddings followed by mean pooling -> one fixed-size vector.
    word_embedding_model = models.BERT(model_name)
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    logging.info("Read AllNLI train dataset")
    train_data = SentencesDataset(nli_reader.get_examples('train.gz'),
                                  model=model)
    train_dataloader = DataLoader(train_data, shuffle=True,
                                  batch_size=batch_size)
    train_loss = losses.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=num_labels)

    logging.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                                model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    num_epochs = 1
    # Warm up over the first 10% of training steps.
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
    logging.info("Warmup-steps: {}".format(warmup_steps))

    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)

    # Reload the saved checkpoint and evaluate it on the STS test split.
    model = SentenceTransformer(model_save_path)
    test_data = SentencesDataset(
        examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False,
                                 batch_size=batch_size)
    model.evaluate(EmbeddingSimilarityEvaluator(test_dataloader))
def pretrained_model_score(self, model_name, expected_score):
    """Assert `model_name` scores within 0.1 of `expected_score` on STS test."""
    model = SentenceTransformer(model_name)
    reader = STSDataReader('../examples/datasets/stsbenchmark')
    data = SentencesDataset(examples=reader.get_examples("sts-test.csv"),
                            model=model)
    loader = DataLoader(data, shuffle=False, batch_size=8)
    score = model.evaluate(EmbeddingSimilarityEvaluator(loader)) * 100
    print(model_name, "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
    assert abs(score - expected_score) < 0.1
def run():
    """Evaluate the saved model on the configured test TSV (binary task).

    Paths and batch size come from the project `config` module; labels are
    coerced to float before being handed to the evaluator.
    """
    test_file = config.TEST_FILE
    test_batch = config.TEST_BATCH_SIZE
    model_save_path = config.MODEL_SAVE_PATH

    frame = pd.read_csv(test_file, sep='\t',
                        names=['idx', 'sent1', 'sent2', 'label'])
    frame['label'] = pd.to_numeric(frame['label'], downcast='float')
    sent1, sent2, labels = dataset.Dataset().read(frame)

    evaluator = evaluation.BinaryClassificationEvaluator(
        sent1, sent2, labels,
        batch_size=test_batch,
        show_progress_bar=True)
    SentenceTransformer(model_save_path).evaluate(evaluator)
def test_self():
    """Evaluate the most recent checkpoint in ./output on the self test set.

    Scans ./output, sorts checkpoint directories by modification time, and
    evaluates label accuracy of the newest one on test.csv.
    """
    sts_reader = Self_csv_DataReader('./self_dataset/')
    model_save_path = './output'
    # Pick the most recently modified checkpoint directory.
    dir_list = os.listdir(model_save_path)
    dir_list.sort(key=lambda fn: os.path.getmtime(os.path.join(model_save_path, fn)))
    model_save_path = os.path.join(model_save_path, dir_list[-1])
    # BUGFIX: a hard-coded checkpoint path used to overwrite model_save_path
    # right after the mtime sort, making the latest-checkpoint lookup dead
    # code. The debug leftover has been removed so the newest run is used.
    model = SentenceTransformer(model_save_path)
    examples, label_text = sts_reader.get_examples("test.csv", _eval=True)
    test_data = SentencesDataset(examples=examples, model=model)
    test_dataloader = DataLoader(test_data,
                                 shuffle=False,
                                 batch_size=config.train_batch_size)
    evaluator = LabelAccuracyEvaluator(
        test_dataloader,
        softmax_model=Softmax_label(
            model=model,
            sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
            num_labels=config.train_num_labels),
        label_text=label_text)
    model.evaluate(evaluator, output_path=model_save_path)
# NOTE(review): this fragment is cut mid-expression — the call whose closing
# paren opens the line starts before this chunk. Whitespace-mangled source is
# kept byte-identical; only this comment block is added. The code builds one
# evaluator per STS target file (whitening variant when args.whitening is
# set, plain cosine otherwise), runs each task group through a
# SequentialEvaluator whose score is the mean of its members, and logs
# per-task and overall results scaled by 100.
os.path.join(script_folder_path, args.sts_corpus)) for idx, target in enumerate(target_eval_files): output_filename_eval = os.path.join(script_folder_path, args.sts_corpus + target + "-test.csv") if args.whitening: evaluators[target[:5]].append( WhiteningEmbeddingSimilarityEvaluator.from_input_examples( sts_reader.get_examples(output_filename_eval), measure_data_num=target_eval_data_num[idx], embed_dim=args.embed_dim, name=target, main_similarity=SimilarityFunction.COSINE)) else: evaluators[target[:5]].append( EmbeddingSimilarityEvaluator.from_input_examples( sts_reader.get_examples(output_filename_eval), name=target, main_similarity=SimilarityFunction.COSINE)) all_results = [] logger_text = "" for task, sequential_evaluator in evaluators.items(): result = model.evaluate( SequentialEvaluator( sequential_evaluator, main_score_function=lambda scores: np.mean(scores))) logger_text += "%.2f \t" % (result * 100) all_results.append(result * 100) logger.info(" \t".join(target_eval_tasks) + " \tOverall.") logger.info(logger_text + "%.2f" % np.mean(all_results))
def main():
    """CLI entry point: parse args, set up device, evaluate and optionally train.

    Builds a SentenceTransformer from --model_name_or_path, runs an embedding
    similarity evaluation on dev.tsv, and when --do_train is given trains on
    train.tsv with cosine-similarity loss.
    """
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--data_dir", default=None, type=str, required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument(
        "--model_name_or_path", default=None, type=str, required=True,
        help="Path to pre-trained model or shortcut name selected in the list: ")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument(
        "--gradient_accumulation_steps", type=int, default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument(
        "--max_seq_length", default=510, type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument(
        "--per_gpu_train_batch_size", default=8, type=int,
        help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int,
        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    # BUGFIX: args.no_cuda was read below but the flag was never declared,
    # which raised AttributeError on every run.
    parser.add_argument("--no_cuda", action="store_true",
                        help="Avoid using CUDA even when it is available.")
    parser.add_argument("--do_train", action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true",
                        help="Whether to run eval on the dev set.")
    args = parser.parse_args()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    # BUGFIX: the original format string had five %s placeholders but only
    # four arguments (the "16-bits training" value was never passed); the
    # unused placeholder is removed.
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
    )

    patent_reader = PatentDataReader(args.data_dir, normalize_scores=True)
    model = SentenceTransformer(args.model_name_or_path)
    test_data = SentencesDataset(
        examples=patent_reader.get_examples("dev.tsv", max_examples=40),
        model=model)
    test_dataloader = DataLoader(test_data, shuffle=False,
                                 batch_size=args.per_gpu_train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
    model.evaluate(evaluator)

    # Convert the dataset to a DataLoader ready for training
    print("Read STSbenchmark train dataset")
    train_data = SentencesDataset(
        patent_reader.get_examples('train.tsv', max_examples=17714), model)
    train_dataloader = DataLoader(train_data, shuffle=True,
                                  batch_size=args.per_gpu_train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will
        # download model & vocab.
        torch.distributed.barrier()
    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_data = SentencesDataset(
            patent_reader.get_examples('train.tsv', max_examples=17714), model)
        # BUGFIX: was `train(args, train_dataset, model, tokenizer)` with both
        # `train_dataset` and `tokenizer` undefined (NameError). Pass the
        # dataset built above and resolve the tokenizer from the model.
        # NOTE(review): assumes `train` accepts the model's tokenizer (or
        # None) — confirm against the `train` implementation.
        tokenizer = getattr(model, 'tokenizer', None)
        global_step, tr_loss = train(args, train_data, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s",
                    global_step, tr_loss)
# Parse CLI options, build the evaluator that matches the task type, and
# score the trained model on the selected dataset's test split.
# (The `parser` object is created earlier in this script.)
parser.add_argument('--trained_model_path', type=str, default='./model_save')
parser.add_argument('--output_dir', type=str, default='./performance/')
parser.add_argument('--dataset', type=str, default='msrp',
                    choices=['msrp', 'sts', 'atec', 'ccks', 'chsts'])
parser.add_argument('--task_type', type=str, default='',
                    choices=['classification', 'regression'])
args = parser.parse_args()

trained_model_path = args.trained_model_path
output_dir = args.output_dir
dataset = args.dataset
task_type = args.task_type

test_examples = ld.load_dataset(dataset_name=dataset, dataset_type='test')
# Classification tasks use pair-classification accuracy; anything else is
# scored with embedding similarity (regression-style).
if task_type == "classification":
    evaluator = evaluation.BinaryClassificationEvaluator.from_input_examples(
        test_examples)
else:
    evaluator = evaluation.EmbeddingSimilarityEvaluator.from_input_examples(
        test_examples)
model = SentenceTransformer(trained_model_path)
model.evaluate(evaluator, output_dir)
# NOTE(review): the source below was collapsed onto single physical lines by
# an export step; the code is kept byte-identical and only this comment block
# is added, because the statement ordering (AMP scaler bookkeeping, per-
# objective optimizer/scheduler stepping, checkpoint selection) is too
# intricate to restyle safely. fit() trains one or more
# (DataLoader, loss_model) objectives round-robin per step, logs
# 'training_loss' and 'val_ARI' to TensorBoard, checkpoints the best-scoring
# model to output_path, and — when a test_evaluator is given — reloads the
# best checkpoint and logs its 'test_ARI' (shuffling models between CPU and
# GPU to fit memory).
def fit(self, train_objectives: Iterable[Tuple[DataLoader, nn.Module]], evaluator: SentenceEvaluator = None, test_evaluator: SentenceEvaluator = None, epochs: int = 1, steps_per_epoch=None, scheduler: str = 'WarmupLinear', warmup_steps: int = 10000, optimizer_class: Type[Optimizer] = transformers.AdamW, optimizer_params: Dict[str, object] = {'lr': 2e-5}, weight_decay: float = 0.01, evaluation_steps: int = 0, output_path: str = None, save_best_model: bool = True, max_grad_norm: float = 1, use_amp: bool = False, callback: Callable[[float, int, int], None] = None, show_progress_bar: bool = True): tensorboard_writer = SummaryWriter('./tensorboard_logs') if use_amp: from torch.cuda.amp import autocast scaler = torch.cuda.amp.GradScaler() self.to(self._target_device) GPUtil.showUtilization() if output_path is not None: os.makedirs(output_path, exist_ok=True) dataloaders = [dataloader for dataloader, _ in train_objectives] # Use smart batching for dataloader in dataloaders: dataloader.collate_fn = self.smart_batching_collate loss_models = [loss for _, loss in train_objectives] for loss_model in loss_models: loss_model.to(self._target_device) self.best_score = -9999999 if steps_per_epoch is None or steps_per_epoch == 0: steps_per_epoch = min( [len(dataloader) for dataloader in dataloaders]) num_train_steps = int(steps_per_epoch * epochs) # Prepare optimizers optimizers = [] schedulers = [] for loss_model in loss_models: param_optimizer = list(loss_model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': weight_decay }, { 'params': [ p for n, p in param_optimizer if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = optimizer_class(optimizer_grouped_parameters, **optimizer_params) scheduler_obj = self._get_scheduler(optimizer, scheduler=scheduler, warmup_steps=warmup_steps, 
t_total=num_train_steps) optimizers.append(optimizer) schedulers.append(scheduler_obj) global_step = 0 data_iterators = [iter(dataloader) for dataloader in dataloaders] num_train_objectives = len(train_objectives) skip_scheduler = False config = {'epochs': epochs, 'steps_per_epoch': steps_per_epoch} for epoch in trange(config.get('epochs'), desc="Epoch", disable=not show_progress_bar): training_steps = 0 running_loss_0 = 0.0 for loss_model in loss_models: loss_model.zero_grad() loss_model.train() for _ in trange(config.get('steps_per_epoch'), desc="Iteration", smoothing=0.05, disable=not show_progress_bar): for train_idx in range(num_train_objectives): loss_model = loss_models[train_idx] optimizer = optimizers[train_idx] scheduler = schedulers[train_idx] data_iterator = data_iterators[train_idx] try: data = next(data_iterator) except StopIteration: data_iterator = iter(dataloaders[train_idx]) data_iterators[train_idx] = data_iterator data = next(data_iterator) features, labels = data if use_amp: with autocast(): loss_value = loss_model(features, labels) scale_before_step = scaler.get_scale() scaler.scale(loss_value).backward() scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(loss_model.parameters(), max_grad_norm) scaler.step(optimizer) scaler.update() if train_idx == 0: running_loss_0 += loss_value.item() skip_scheduler = scaler.get_scale( ) != scale_before_step else: loss_value = loss_model(features, labels) if train_idx == 0: running_loss_0 += loss_value.item() loss_value.backward() torch.nn.utils.clip_grad_norm_(loss_model.parameters(), max_grad_norm) optimizer.step() optimizer.zero_grad() if not skip_scheduler: scheduler.step() training_steps += 1 global_step += 1 if evaluation_steps > 0 and training_steps % evaluation_steps == 0: tensorboard_writer.add_scalar( 'training_loss', running_loss_0 / evaluation_steps, global_step) #logger.report_scalar('Loss', 'training_loss', iteration=global_step, value=running_loss_0/evaluation_steps) running_loss_0 = 
0.0 #self._eval_during_training(evaluator, output_path, save_best_model, epoch, training_steps, callback) if evaluator is not None: score = evaluator(self, output_path=output_path, epoch=epoch, steps=training_steps) tensorboard_writer.add_scalar('val_ARI', score, global_step) #logger.report_scalar('Training progress', 'val_ARI', iteration=global_step, value=score) if callback is not None: callback(score, epoch, training_steps) if score > self.best_score: self.best_score = score if save_best_model: print('Saving model at: ' + output_path) self.save(output_path) for loss_model in loss_models: loss_model.zero_grad() loss_model.train() #self._eval_during_training(evaluator, output_path, save_best_model, epoch, -1, callback) #tensorboard_writer.add_scalar('training_loss', running_loss_0 / evaluation_steps, global_step) #logger.report_scalar('Loss', 'training_loss', iteration=global_step, value=running_loss_0 / evaluation_steps) if evaluator is not None: score = evaluator(self, output_path=output_path, epoch=epoch, steps=training_steps) tensorboard_writer.add_scalar('val_ARI', score, global_step) #logger.report_scalar('Training progress', 'val_ARI', iteration=global_step, value=score) if callback is not None: callback(score, epoch, training_steps) if score > self.best_score: self.best_score = score if save_best_model: self.save(output_path) if test_evaluator is not None: best_model = SentenceTransformer(output_path) device = self.device if torch.cuda.is_available(): self.to(torch.device('cpu')) best_model.to(device) test_ari = best_model.evaluate(test_evaluator) best_model.to(torch.device('cpu')) self.to(device) else: test_ari = best_model.evaluate(test_evaluator) tensorboard_writer.add_scalar('test_ARI', test_ari, global_step) #logger.report_scalar('Training progress', 'test_ARI', iteration=global_step, value=test_ari) if evaluator is None and output_path is not None: # No evaluator, but output path: save final model version self.save(output_path)
# Fine-tune with semi-hard triplet loss, tracking triplet accuracy on the
# dev split, then evaluate the result on the TREC test split.
train_loss = losses.BatchSemiHardTripletLoss(sentence_embedder=model)

logging.info("Read TREC val dataset")
dev_evaluator = TripletEvaluator.from_input_examples(dev_set, name='dev')
logging.info("Performance before fine-tuning:")
dev_evaluator(model)

# Warm up over the first 10% of training steps.
warmup_steps = int(len(train_dataset) * num_epochs / train_batch_size * 0.1)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=output_path,
)

# Load the stored model and evaluate its performance on the TREC dataset.
logging.info("Evaluating model on test set")
test_evaluator = TripletEvaluator.from_input_examples(test_set, name='test')
model.evaluate(test_evaluator)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training: 10% of the training data is used for warm-up.
warmup_steps = math.ceil(
    len(train_data) * args.num_epochs / args.batch_size * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=args.num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=args.ckpt_path)

# Reload the stored checkpoint and score it on the Vietnamese STS test file.
model = SentenceTransformer(args.ckpt_path)
test_data = SentencesDataset(
    examples=sts_reader.get_examples("sts-test_vi.csv"), model=model)
test_dataloader = DataLoader(test_data, shuffle=False,
                             batch_size=args.batch_size)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
model.evaluate(evaluator, args.ckpt_path)
# Build the tail of a Keras binary classifier: BiLSTM encoder, dense stack,
# sigmoid output; train for 100 epochs and report confusion-matrix metrics.
model.add(tf.keras.layers.Dense(units, activation='relu'))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

# One or more dense layers; edit the list to experiment with layer sizes.
for units in [64, 64]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))

# Output layer: a single sigmoid unit for binary classification.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy',
                       tf.keras.metrics.TruePositives(name='tp'),
                       tf.keras.metrics.FalsePositives(name='fp'),
                       tf.keras.metrics.TrueNegatives(name='tn'),
                       tf.keras.metrics.FalseNegatives(name='fn')])

model.fit(train_data, epochs=100, validation_data=test_data)
loss, accuracy, tp, fp, tn, fn = model.evaluate(test_data)
print('Test Loss: {}'.format(loss))
print('Test Accuracy: {}'.format(accuracy))
print('Test TP: {}'.format(tp))
print('Test FP: {}'.format(fp))
print('Test TN: {}'.format(tn))
print('Test FN: {}'.format(fn))
# Mean-pool the word embeddings into one fixed-size sentence vector and
# evaluate the resulting model on a tab-separated STS test file.
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Column layout: sentence1, sentence2, score; scores already in [0, 1].
sts_reader = STSBenchmarkDataReader(data_folder,
                                    s1_col_idx=0,
                                    s2_col_idx=1,
                                    score_col_idx=2,
                                    delimiter="\t",
                                    min_score=0,
                                    max_score=1)

test_data = SentencesDataset(
    examples=sts_reader.get_examples("test_sts.tsv"),
    model=model,
)
print("DataLoader")
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
print("EmbeddingSimilarityEvaluator")
evaluator = EmbeddingSimilarityEvaluator(test_dataloader,
                                         show_progress_bar=False)
print(evaluator)
model.evaluate(evaluator, output_path)
# NOTE(review): the class below was collapsed onto a few physical lines by an
# export step; the code is kept byte-identical and only this header comment
# is added — the near-duplicate data-prep methods and the split mid-expression
# line breaks make a restyle too risky. BertTrainer wraps SentenceTransformer
# NLI fine-tuning: it builds a Transformer + mean-pooling model with a
# SoftmaxLoss head, prepares train/dev/test DataLoaders from SNLI-, FEVER- or
# MNLI-format files (label2int maps class names to ids; SNLI uses
# contradiction/entailment/neutral, otherwise SUPPORTS/REFUTES), supports
# saving/loading both the sentence model and the classifier head, and
# train_model() runs fit() once per epoch, returning per-epoch validation and
# test LabelAccuracy scores.
class BertTrainer: """ Class to train NLI model :param logger: logger to use in model """ def __init__(self, logger: Logger, train_path: str, dev_path: str, test_path: str, base_model: str, batch_size: int, path_to_save: str, **kwargs): self.logger = logger self.logger.info("Models are loaded and ready to use.") self.train_path = train_path self.dev_path = dev_path self.test_path = test_path self.base_model = base_model self.batch_size = batch_size dataset = 'snli' if dataset == 'snli': self.label2int = { "contradiction": 0, "entailment": 1, "neutral": 2 } else: self.label2int = {"SUPPORTS": 1, "REFUTES": 0} self.path_to_save = path_to_save def initialize_model(self): # Read the dataset # Use BERT for mapping tokens to embeddings word_embedding_model = models.Transformer(self.base_model, max_seq_length=128) # Apply mean pooling to get one fixed sized sentence vector pooling_model = models.Pooling( word_embedding_model.get_word_embedding_dimension(), pooling_mode_mean_tokens=True, pooling_mode_cls_token=False, pooling_mode_max_tokens=False) self.model = SentenceTransformer( modules=[word_embedding_model, pooling_model]) self.train_loss_nli = losses.SoftmaxLoss( model=self.model, sentence_embedding_dimension=self.model. 
get_sentence_embedding_dimension(), num_labels=len(self.label2int)) def preparing_data(self): """ Method used for data preparation before training it reads data from files predefined in config and process them Uses for SNLI data format """ train_snli = _create_examples_snli(_read_tsv(self.train_path), 'train_s') dev_snli = _create_examples_snli(_read_tsv(self.dev_path), 'dev_s') test_snli = _create_examples_snli(_read_tsv(self.test_path), 'test_s') # Convert the dataset to a DataLoader ready for training self.logger.info("Read train dataset") train_nli_samples = [] dev_nli_samples = [] test_nli_samples = [] for row in tqdm(train_snli): label_id = self.label2int[row[3]] train_nli_samples.append( InputExample(guid=row[0], texts=[row[1], row[2]], label=label_id)) for row in tqdm(dev_snli): label_id = self.label2int[row[3]] dev_nli_samples.append( InputExample(guid=row[0], texts=[row[1], row[2]], label=label_id)) for row in tqdm(test_snli): label_id = self.label2int[row[3]] test_nli_samples.append( InputExample(guid=row[0], texts=[row[1], row[2]], label=label_id)) train_data_nli = SentencesDataset(train_nli_samples, model=self.model) self.train_dataloader_nli = DataLoader(train_data_nli, shuffle=True, batch_size=self.batch_size) dev_data_nli = SentencesDataset(dev_nli_samples, model=self.model) self.dev_dataloader_nli = DataLoader(dev_data_nli, shuffle=True, batch_size=self.batch_size) test_data_nli = SentencesDataset(test_nli_samples, model=self.model) self.test_dataloader_nli = DataLoader(test_data_nli, shuffle=True, batch_size=self.batch_size) def preparing_data_fever(self): """ Method used for data preparation before training it reads data from files predefined in config and process them Uses for FEVER SNLI-style data format """ def read_fever(path): df = pd.read_csv(path) df.dropna(inplace=True) df.reset_index(drop=True, inplace=True) return df train_snli = _create_examples_fever(read_fever(self.train_path), 'train_s') dev_snli = 
_create_examples_fever(read_fever(self.dev_path), 'dev_s') test_snli = _create_examples_fever(read_fever(self.test_path), 'test_s') # Convert the dataset to a DataLoader ready for training self.logger.info("Read train dataset") train_nli_samples = [] dev_nli_samples = [] test_nli_samples = [] for row in tqdm(train_snli): label_id = self.label2int[row[3]] train_nli_samples.append( InputExample(guid=row[0], texts=[row[1], row[2]], label=label_id)) for row in tqdm(dev_snli): label_id = self.label2int[row[3]] dev_nli_samples.append( InputExample(guid=row[0], texts=[row[1], row[2]], label=label_id)) for row in tqdm(test_snli): label_id = self.label2int[row[3]] test_nli_samples.append( InputExample(guid=row[0], texts=[row[1], row[2]], label=label_id)) train_data_nli = SentencesDataset(train_nli_samples, model=self.model) self.train_dataloader_nli = DataLoader(train_data_nli, shuffle=True, batch_size=self.batch_size) dev_data_nli = SentencesDataset(dev_nli_samples, model=self.model) self.dev_dataloader_nli = DataLoader(dev_data_nli, shuffle=True, batch_size=self.batch_size) test_data_nli = SentencesDataset(test_nli_samples, model=self.model) self.test_dataloader_nli = DataLoader(test_data_nli, shuffle=True, batch_size=self.batch_size) def preparing_data_mnli(self): """ Method used for data preparation before training it reads data from files predefined in config and process them Uses for MNLI data format """ def read_mnli(path): df = pd.read_table(path, error_bad_lines=False) df.sentence1 = df.sentence1.astype(str) df.sentence2 = df.sentence2.astype(str) df.gold_label = df.gold_label.astype(str) df = df[df.gold_label != '-'] df.dropna(inplace=True) return df train_snli = _create_examples_mnli(read_mnli(self.train_path), 'train_s') dev_snli = _create_examples_mnli(read_mnli(self.dev_path), 'dev_s') test_snli = _create_examples_mnli(read_mnli(self.test_path), 'test_s') # Convert the dataset to a DataLoader ready for training self.logger.info("Read train dataset") 
train_nli_samples = [] dev_nli_samples = [] test_nli_samples = [] print(len(train_snli)) for row in tqdm(train_snli): label_id = self.label2int[row[3]] train_nli_samples.append( InputExample(guid=row[0], texts=[row[1], row[2]], label=label_id)) for row in tqdm(dev_snli): label_id = self.label2int[row[3]] dev_nli_samples.append( InputExample(guid=row[0], texts=[row[1], row[2]], label=label_id)) for row in tqdm(test_snli): label_id = self.label2int[row[3]] test_nli_samples.append( InputExample(guid=row[0], texts=[row[1], row[2]], label=label_id)) print(len(train_nli_samples)) train_data_nli = SentencesDataset(train_nli_samples, model=self.model) self.train_dataloader_nli = DataLoader(train_data_nli, shuffle=True, batch_size=self.batch_size) dev_data_nli = SentencesDataset(dev_nli_samples, model=self.model) self.dev_dataloader_nli = DataLoader(dev_data_nli, shuffle=True, batch_size=self.batch_size) test_data_nli = SentencesDataset(test_nli_samples, model=self.model) self.test_dataloader_nli = DataLoader(test_data_nli, shuffle=True, batch_size=self.batch_size) def save_model(self): """ Method used for model saving """ torch.save(self.train_loss_nli.classifier.cpu(), self.path_to_save + 'classifier_model') self.model.save(self.path_to_save + "bert_model_trained") def load_model(self, text_model_path, classifier_path): """ Method used for pretrained model loading """ self.model = SentenceTransformer(text_model_path) self.classification_model = torch.load(classifier_path) self.train_loss_nli = losses.SoftmaxLoss( model=self.model, sentence_embedding_dimension=self.model. 
get_sentence_embedding_dimension(), num_labels=len(self.label2int)) self.train_loss_nli.classifier = self.classification_model def train_model(self, number_of_epochs=1): """ Method implements model training process """ warmup_steps = 10000 self.logger.info("Warmup-steps: {}".format(warmup_steps)) train_objectives = [(self.train_dataloader_nli, self.train_loss_nli)] validation_performance = [] test_performance = [] test_evaluator = LabelAccuracyEvaluator( self.test_dataloader_nli, name='nli_test', softmax_model=self.train_loss_nli) dev_evaluator = LabelAccuracyEvaluator( self.dev_dataloader_nli, name='nli_test', softmax_model=self.train_loss_nli) for i in range(number_of_epochs): self.model.fit(train_objectives=train_objectives) validation_performance.append(self.model.evaluate(dev_evaluator)) test_performance.append(self.model.evaluate(test_evaluator)) print(f'Iteration - {i + 1} ...') print(f'Validation performance - {validation_performance[-1]} ...') print(f'Test performance - {test_performance[-1]} ...') return validation_performance, test_performance
import numpy as np

# Evaluate a trained sentence-transformer on the held-out tail of the ARQMath
# collection: load the model, stream examples from the collection, take the
# last 10% as the test split, and report embedding-similarity performance.
device = "cuda"

model = SentenceTransformer(
    '/home/xstefan3/arqmath/compubert/out_whole_sampled_eval')

# Parse the ARQMath collection and turn its question map into examples.
clef_home_directory_file_path = '/home/xstefan3/arqmath/data/Collection'
reader = DataReaderRecord(clef_home_directory_file_path)
examples = list(examples_from_questions_tup(reader.post_parser.map_questions))

# 80/10/10 boundaries over the example stream; only the final (test) slice
# beyond the 90% mark is consumed below.
n_examples = len(examples)
split_bounds = (int(0.8 * n_examples), int(0.9 * n_examples))

test_dataset = SentencesDataset(examples[split_bounds[1]:],
                                model,
                                show_progress_bar=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

evaluator = EmbeddingSimilarityEvaluator(test_dataloader,
                                         show_progress_bar=True,
                                         device=device)
score = model.evaluate(evaluator)
print(score)
# NOTE(review): script fragment; `train_data`, `train_batch_size`,
# `train_dataloader`, `evaluator`, `embedder` and `triplet_reader` are
# defined earlier, outside this view.
output_path = "output/bert-base-wikipedia-sections-mean-tokens"
num_epochs = 1
warmup_steps = int(len(train_data)*num_epochs/train_batch_size/10)  #10% of train data
train_config = TrainConfig(learning_rate=2e-5,
                           weight_decay=0.01,
                           epochs=num_epochs,
                           evaluation_steps=1000,
                           output_path=output_path,
                           save_best_model=True,
                           evaluator=evaluator,
                           warmup_steps=warmup_steps)
embedder.train(dataloader=train_dataloader, train_config=train_config)

##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
# NOTE(review): despite the banner, the data below comes from
# triplet_reader's test.csv and is scored with TripletEvaluator, not an STS
# evaluator — confirm which dataset this actually evaluates.
embedder = SentenceTransformer(output_path)
test_data = SentencesDataset(examples=triplet_reader.get_examples('test.csv'),
                             model=embedder)
test_dataloader = DataLoader(test_data,
                             shuffle=False,
                             batch_size=train_batch_size,
                             collate_fn=embedder.encoder.smart_batching_collate)
evaluator = TripletEvaluator(test_dataloader)
embedder.evaluate(evaluator)
# NOTE(review): script fragment; `word_embedding_model`, `pooling_model`,
# `corpus`, `s3` and `evaluator` are defined earlier, outside this view.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Build training pairs from the corpus; every sentence is also appended to
# `s3` — presumably for later encoding, verify against the surrounding code.
train_examples = []
test_examples = []  # NOTE(review): never populated in this fragment
for index, row in corpus.iterrows():
    train_examples.append(
        InputExample(texts=[row['sentence_A'], row['sentence_B']],
                     label=row['relatedness_score']))
    s3.append(row['sentence_A'])
    s3.append(row['sentence_B'])

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
train_loss = losses.CosineSimilarityLoss(model)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=5,
          warmup_steps=100,
          evaluator=evaluator,
          evaluation_steps=500)
model.best_score  # NOTE(review): value not stored or printed — no effect
#1epochs 0.9232
#5epochs 0.932
model.evaluate(evaluator)  #0.9232
# model.save("roberta_base_CDS_train_biencoder")
# -----------------------------------------------
# from sentence_transformers import CrossEncoder
# model = CrossEncoder('roberta_base', max_length=256)
# model.fit(train_dataloader,
#           epochs=1, warmup_steps=100)
# scores = model.predict([[sentences1,sentences2 ],[sentences3,sentences2],[sentences1,sentences3 ]])
# #pretrained model 0.48104742, 0.48180264, 0.47577295
# #after training 0.26556703, 0.03470451, 0.03307376
# --------------------------------------------
sentences1 = 'Piłka nożna z wieloma grającymi facetami'
sentences2 = 'Jacyś mężczyźni grają w futbol'
# NOTE(review): fragment — the lines below are the trailing keyword
# arguments of a bi-encoder training call whose opening lies before this
# view.
evaluation_steps=1000,
warmup_steps=warmup_steps,
output_path=bi_encoder_path,
output_path_ignore_not_empty=True)

###############################################################
#
# Evaluate Augmented SBERT performance on QQP benchmark dataset
#
###############################################################

# Loading the augmented sbert model
bi_encoder = SentenceTransformer(bi_encoder_path)

# Read the QQP classification test pairs: question1, question2 and the
# binary is_duplicate label, from a tab-separated file.
logging.info("Read QQP test dataset")
test_sentences1 = []
test_sentences2 = []
test_labels = []
with open(os.path.join(qqp_dataset_path, "classification/test_pairs.tsv"),
          encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        test_sentences1.append(row['question1'])
        test_sentences2.append(row['question2'])
        test_labels.append(int(row['is_duplicate']))

evaluator = BinaryClassificationEvaluator(test_sentences1, test_sentences2,
                                          test_labels)
bi_encoder.evaluate(evaluator)
os.remove(os.path.join(curr_dir, "prediction_results.csv")) # Model path model_save_path = curr_dir batch_size = 24 agb_reader = TestAGBReader('datasets/og-test') train_num_labels = agb_reader.get_num_labels() model = SentenceTransformer(model_save_path, device="cpu") train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) train_loss.classifier = torch.load(os.path.join(model_save_path, "2_Softmax/pytorch_model.bin")) print("test") test_dir = "/data/daumiller/sentence-transformers/examples/datasets/og-test" for fn in sorted(os.listdir(test_dir)): examples = agb_reader.get_examples(fn) if not examples: continue # Hack to avoid problems with docs almost as long as batch size if len(examples) == batch_size + 1: batch_size_used = batch_size - 3 else: batch_size_used = batch_size test_data = SentencesDataset(examples=examples, model=model, shorten=True) test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size_used) evaluator = LabelGenerationEvaluator(test_dataloader, softmax_model=train_loss) model.evaluate(evaluator, model_save_path)
def train(triplet_data_dir, output): logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, handlers=[LoggingHandler()]) ### Create a torch.DataLoader that passes training batch instances to our model train_batch_size = 16 triplet_reader = TripletReader(triplet_data_dir, s1_col_idx=1, s2_col_idx=2, s3_col_idx=3, delimiter=',', quoting=csv.QUOTE_MINIMAL, has_header=True) # output_path = "output/bert-base-wikipedia-sections-mean-tokens-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") output_path = output + datetime.now().strftime("%Y-%m-%d_%H-%M-%S") num_epochs = 1 ### Configure sentence transformers for training and train on the provided dataset # Use BERT for mapping tokens to embeddings word_embedding_model = models.BERT('bert-base-uncased') # Apply mean pooling to get one fixed sized sentence vector pooling_model = models.Pooling( word_embedding_model.get_word_embedding_dimension(), pooling_mode_mean_tokens=True, pooling_mode_cls_token=False, pooling_mode_max_tokens=False) model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) logging.info("Read Triplet train dataset") train_data = SentencesDataset(examples=triplet_reader.get_examples( 'train.csv', 2000000), model=model) train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) train_loss = losses.TripletLoss(model=model) logging.info("Read Wikipedia Triplet dev dataset") dev_data = SentencesDataset(examples=triplet_reader.get_examples( 'validation.csv', 10000), model=model) dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) evaluator = TripletEvaluator(dev_dataloader) warmup_steps = int(len(train_data) * num_epochs / train_batch_size * 0.1) #10% of train data # Train the model model.fit(train_objectives=[(train_dataloader, train_loss)], evaluator=evaluator, epochs=num_epochs, evaluation_steps=1000, warmup_steps=warmup_steps, output_path=output_path) 
############################################################################## # # Load the stored model and evaluate its performance on STS benchmark dataset # ############################################################################## model = SentenceTransformer(output_path) test_data = SentencesDataset( examples=triplet_reader.get_examples('test.csv'), model=model) test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) evaluator = TripletEvaluator(test_dataloader) model.evaluate(evaluator)
# NOTE(review): fragment — the first line closes a constructor call
# (presumably `dev_data = SentencesDataset(examples=..., ` — confirm) opened
# before this view; `model`, `batch_size`, `train_data`, `train_dataloader`,
# `train_loss`, `model_save_path` and `sts_reader` are defined earlier.
model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training
num_epochs = 10
warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

##############################################################################
#
# Load the stored model and evaluate its performance on STS benchmark dataset
#
##############################################################################
model = SentenceTransformer(model_save_path)
test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"),
                             model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
model.evaluate(evaluator)
def train_nli():
    """Fine-tune a transformer on AllNLI with SoftmaxLoss, tracking label
    accuracy on the STS-benchmark dev split during training, then run a
    final evaluation (see review notes at the bottom)."""
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
    #model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
    model_name = 'pretrained_model/bert-base-uncased'

    # Read the dataset
    train_batch_size = 6
    nli_reader = NLIDataReader('./examples/datasets/AllNLI')
    sts_reader = STSBenchmarkDataReader('./examples/datasets/stsbenchmark')
    train_num_labels = nli_reader.get_num_labels()
    model_save_path = 'output/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read AllNLI train dataset")
    train_dataset = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.SoftmaxLoss(model=model,
                                    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                    num_labels=train_num_labels)

    logging.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    # NOTE(review): the evaluator wraps a freshly constructed Softmax_label
    # head rather than the `train_loss` head being trained, so the accuracy
    # it reports comes from an untrained classifier — confirm whether
    # softmax_model=train_loss was intended.
    evaluator = LabelAccuracyEvaluator(dev_dataloader,
                                       softmax_model=Softmax_label(
                                           model=model,
                                           sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                           num_labels=train_num_labels))

    # Configure the training
    num_epochs = 1
    warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1)  #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=100,
              warmup_steps=warmup_steps,
              output_path=model_save_path
              )

    ##############################################################################
    #
    # Load the stored model and evaluate its performance on STS benchmark dataset
    #
    ##############################################################################
    #model = SentenceTransformer(model_save_path)
    test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size)
    #evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
    # NOTE(review): the final evaluate() reuses the dev LabelAccuracyEvaluator;
    # test_data/test_dataloader are built but never wired into an evaluator
    # (the EmbeddingSimilarityEvaluator line above is commented out).
    model.evaluate(evaluator)