def main(args):
    """Fine-tune a SciBERT sentence-transformer on STS data.

    Builds the run-specific output directory, optionally resumes from the
    second-to-last checkpoint, wires up train/dev objectives, and calls fit().
    """
    # Derive a unique output directory from the hyper-parameters of this run.
    args.output_dir = os.path.join(
        args.output_dir,
        f'training_nli_sts-sci_bert-num_epochs_{args.num_epochs}-bs_{args.batch_size}-lr_{args.learning_rate}')
    logger.info(f"the output directory is {args.output_dir}.")

    # Skip runs that already produced results.
    if avoid_duplicate(args):
        logger.info(f"the experiment is previously done.")
        sys.exit(0)

    if args.should_continue:
        checkpoints = sorted_checkpoints(args)
        if len(checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        elif len(checkpoints) == 1:
            raise ValueError("In theory, the second last checkpoint should be read when using ephemeral.")
        else:
            # The last checkpoint may be incomplete on ephemeral storage;
            # resume from the second-to-last one instead.
            args.sentence_model_path = checkpoints[-2]
        # Load a pre-trained sentence transformer model.
        # args.sentence_model_path has optimizer and scheduler state.
        model = SentenceTransformer_tb(saved_scibert_model_path=args.sentence_model_path)
    else:
        model = SentenceTransformer_tb(saved_scibert_model_path=args.finetuned_model_path)

    sts_reader = STSBenchmarkDataReader(args.data_directory, normalize_scores=True)

    # Convert the dataset to DataLoaders ready for training and dev.
    train_data, train_loss, train_dataloader, train_evaluator = train_config(sts_reader, model, args.batch_size)
    dev_loss, dev_dataloader, dev_evaluator = dev_config(sts_reader, model, args.batch_size)

    # 10% of the training steps are used for warm-up.
    warmup_steps = math.ceil(len(train_data) * args.num_epochs / args.batch_size * 0.1)
    optimizer_params = {'lr': args.learning_rate, 'eps': 1e-6, 'correct_bias': False}
    logger.info(f"Warmup-steps: {warmup_steps}")

    # Train the model; evaluate more frequently in debug mode.
    model.fit(args=args,
              train_objectives=[(train_dataloader, train_loss)],
              eval_objectives=[(dev_dataloader, dev_loss)],
              train_evaluator=train_evaluator,
              evaluator=dev_evaluator,
              train_phase='STS',
              epochs=args.num_epochs,
              evaluation_steps=50 if not args.debug_mode else 2,
              optimizer_params=optimizer_params,
              warmup_steps=warmup_steps)
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# You can specify any huggingface/transformers pre-trained model here,
# for example, bert-base-uncased, roberta-base, xlm-roberta-base.
model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'

# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_' + model_name.replace("/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark', normalize_scores=True)

# Use a Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R)
# for mapping tokens to embeddings.
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training.
logging.info("Read STSbenchmark train dataset")
import sys

# print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

# Fixed base model for this run (no CLI override here).
model_name = 'bert-base-uncased'

# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_' + model_name.replace("/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_reader = STSBenchmarkDataReader('Data/stsbenchmark', normalize_scores=True)

# Use a Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R)
# for mapping tokens to embeddings.
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training.
logging.info("Read STSbenchmark train dataset")
# NOTE(review): collapsed (newline-stripped) fragment of a question-similarity
# training script. Visible logic: pick a score range per task
# ('semeval' -> [0, 2], 'askubuntu'/'quora' -> [0, 1], otherwise
# NotImplementedError), build an STSBenchmarkDataReader over tab-separated
# pairs with those bounds, then wrap train examples in a shuffled DataLoader
# with CosineSimilarityLoss and start reading the dev examples.
# The fragment begins mid-expression (tail of a model-save-path string) and
# ends mid-call (`dev_data = SentencesDataset(examples=...,`), so the missing
# context lives outside this chunk; the line is kept byte-identical rather
# than reformatted to avoid guessing at the truncated statements.
'-continue_training-' + model_name + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) #max score is different if args.task_name == 'semeval': max_score, min_score = 2, 0 elif args.task_name == 'askubuntu' or args.task_name == 'quora': max_score, min_score = 1, 0 else: raise NotImplementedError print("max min score", max_score, min_score) sts_reader = STSBenchmarkDataReader(data_path_folder, normalize_scores=True, s1_col_idx=0, s2_col_idx=1, score_col_idx=2, delimiter="\t", min_score=min_score, max_score=max_score) # Load a pre-trained sentence transformer model # Convert the dataset to a DataLoader ready for training logging.info("Read question similarity train dataset") train_dataset = SentencesDataset(sts_reader.get_examples(train_name), model) train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) train_loss = losses.CosineSimilarityLoss(model=model) logging.info("Read question similarity dev dataset") dev_data = SentencesDataset(examples=sts_reader.get_examples(dev_name),
# NOTE(review): collapsed fragment of a PhoBERT (Vietnamese) training script.
# Visible logic: finish an argparse setup (--bpe_path plus the tail of a
# VnCoreNLP jar-path argument), parse args, configure logging, create the
# checkpoint directory if missing, read the STS data via
# STSBenchmarkDataReader(args.data_path, normalize_scores=True), build a
# models.PhoBERT word-embedding module with VnCoreNLP/BPE tokenizer args, and
# stack a models.CNN (out_channels=256, kernel_sizes=[1, 3, 5, 5, 3, 1]) on
# top of it. The fragment begins mid-`parser.add_argument` call and ends
# mid-call (`pooling_model = models.Pooling(`), so it is kept byte-identical
# rather than reformatted — reconstruction would require guessing the
# truncated statements outside this chunk.
type=str, default="./VnCoreNLP/VnCoreNLP-1.1.1.jar") parser.add_argument('--bpe_path', type=str, default="./PhoBERT") args = parser.parse_args() #### Just some code to print debug information to stdout logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, handlers=[LoggingHandler()]) #### /print debug information to stdout if not os.path.exists(args.ckpt_path): os.mkdir(args.ckpt_path) # Read the dataset sts_reader = STSBenchmarkDataReader(args.data_path, normalize_scores=True) # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings word_embedding_model = models.PhoBERT(args.pre_trained_path, tokenizer_args={ 'vncorenlp_path': args.vncorenlp_path, 'bpe_path': args.bpe_path }) cnn = models.CNN(in_word_embedding_dimension=word_embedding_model. get_word_embedding_dimension(), out_channels=256, kernel_sizes=[1, 3, 5, 5, 3, 1]) # Apply mean pooling to get one fixed sized sentence vector pooling_model = models.Pooling(
# Alternatively, you can also pass a filepath to SentenceTransformer()
pretrained = True
if pretrained:
    model = SentenceTransformer(model_name)
else:
    # Build the model from scratch: transformer encoder + mean pooling.
    word_embedding_model = models.Transformer('bert-base-uncased')
    # Apply mean pooling to get one fixed sized sentence vector.
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark'))
# Tab-separated pairs with gold scores already in [0, 1].
sts_reader = STSBenchmarkDataReader(data_folder,
                                    s1_col_idx=0,
                                    s2_col_idx=1,
                                    score_col_idx=2,
                                    delimiter="\t",
                                    min_score=0,
                                    max_score=1)

test_data = SentencesDataset(examples=sts_reader.get_examples("test_sts.tsv"),
                             model=model)
print("DataLoader")
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
print("EmbeddingSimilarityEvaluator")
evaluator = EmbeddingSimilarityEvaluator(test_dataloader, show_progress_bar=False)
print(evaluator)
# NOTE(review): collapsed fragment of a layer-wise pooling evaluation script.
# Visible logic: build a LayerNPooling module over the word-embedding
# dimension for the layers in `layer_index`, glue it into a
# SentenceTransformer, log the pooling/encoder/whitening configuration, and
# fill a per-task dict of evaluators from "<corpus><target>-test.csv" files —
# using WhiteningEmbeddingSimilarityEvaluator when args.whitening is set and
# EmbeddingSimilarityEvaluator otherwise (both with COSINE main similarity).
# The fragment ends mid-call (`EmbeddingSimilarityEvaluator.from_input_examples(`
# with its arguments truncated), so it is kept byte-identical rather than
# reformatted — the end of the loop body lives outside this chunk.
pooling_model = LayerNPooling( args.pooling, word_embedding_model.get_word_embedding_dimension(), layers=layer_index) model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) logger.info("Pool:{}, Encoder:{}, Whitening:{}".format(args.pooling, args.encoder_name, args.whitening)) evaluators = { task: [] for task in target_eval_tasks } #evaluators has a list of different evaluator classes we call periodically sts_reader = STSBenchmarkDataReader( os.path.join(script_folder_path, args.sts_corpus)) for idx, target in enumerate(target_eval_files): output_filename_eval = os.path.join(script_folder_path, args.sts_corpus + target + "-test.csv") if args.whitening: evaluators[target[:5]].append( WhiteningEmbeddingSimilarityEvaluator.from_input_examples( sts_reader.get_examples(output_filename_eval), measure_data_num=target_eval_data_num[idx], embed_dim=args.embed_dim, name=target, main_similarity=SimilarityFunction.COSINE)) else: evaluators[target[:5]].append( EmbeddingSimilarityEvaluator.from_input_examples( sts_reader.get_examples(output_filename_eval),
script_folder_path = os.path.dirname(os.path.realpath(__file__))

# Limit torch to 4 threads
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = '../training/nli/output/training_nli_bert-base-uncased-2021-01-10_14-44-13'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

sts_corpus = "../datasets/stsbenchmark/"
target_eval_files = set(['sts', 'sts12', 'sts13', 'sts14', 'sts15', 'sts16', 'sick-r'])

# evaluators has a list of different evaluator classes we call periodically
evaluators = []
sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, sts_corpus))
for target in target_eval_files:
    output_filename_eval = os.path.join(script_folder_path, sts_corpus + target + "-test.csv")
    evaluators.append(EmbeddingSimilarityEvaluator.from_input_examples(
        sts_reader.get_examples(output_filename_eval), name=target))

# Aggregate all task evaluators; the overall score is the mean across tasks.
evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores))
model.evaluate(evaluator)
# Write the positive validation pairs (label 1) as a TSV file.
with open(my_loc + '/proc_data/val-pairs.csv', 'w', encoding='utf-8') as f:
    for prs in val_pos_pairs:
        f.write('%s\t%s\t1\n' % (prs[0], prs[1]))
    # for prs in val_neg_pairs:
    #     f.write('%s\t%s\t0\n'%(prs[0], prs[1]))

num_epochs = 4
batch_size = 8
model_save_path = my_loc + '/models/finetune_claim_title_%s' % (fname)

# Binary 0/1 labels, used as-is (no normalization).
sts_reader = STSBenchmarkDataReader(my_loc + '/proc_data/',
                                    s1_col_idx=0,
                                    s2_col_idx=1,
                                    score_col_idx=2,
                                    normalize_scores=False,
                                    min_score=0,
                                    max_score=1)

model = SentenceTransformer(emb_type)

train_data = SentencesDataset(sts_reader.get_examples('train-pairs.csv'), model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model=model)

dev_data = SentencesDataset(examples=sts_reader.get_examples('val-pairs.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
from sentence_transformers.readers import STSBenchmarkDataReader
import logging
import torch

# Limit torch to 4 threads, as this example runs on the CPU.
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# 1) Point the transformer model to the BERT / RoBERTa etc. model you would
#    like to use. Ensure that output_hidden_states is true.
word_embedding_model = models.Transformer('bert-base-uncased',
                                          model_args={'output_hidden_states': True})

# 2) Add WKPooling
pooling_model = models.WKPooling(word_embedding_model.get_word_embedding_dimension())

# 3) Create a sentence transformer model to glue both models together.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark')
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples("sts-test.csv"))
model.evaluate(evaluator)
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
pretrained = False
if pretrained:
    model = SentenceTransformer(model_name)
else:
    # Build from scratch: transformer encoder + mean pooling.
    word_embedding_model = models.Transformer('bert-base-uncased')
    # Apply mean pooling to get one fixed sized sentence vector.
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark'))
sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, 'stsbenchmark'))

test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
model.evaluate(evaluator)
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader
import logging
import torch

# Limit torch to 4 threads, as this example runs on the CPU.
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# 1) Point the transformer model to the BERT / RoBERTa etc. model you would
#    like to use. Ensure that output_hidden_states is true.
word_embedding_model = models.Transformer('bert-base-uncased',
                                          model_args={'output_hidden_states': True})

# 2) Add WKPooling
pooling_model = models.WKPooling(word_embedding_model.get_word_embedding_dimension())

# 3) Create a sentence transformer model to glue both models together.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark')
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples("sts-test.csv"))
model.evaluate(evaluator)
# NOTE(review): collapsed fragment of a "continue training on STS benchmark"
# script. Visible logic: configure logging, set model/batch/epoch constants,
# build a timestamped save path, read the benchmark from a fixed
# /checkpoint/... directory with normalized scores, load the pre-trained
# 'bert-base-nli-mean-tokens' SentenceTransformer, and wrap the train split
# in a shuffled DataLoader with CosineSimilarityLoss. The fragment ends
# mid-call (`dev_data = SentencesDataset(examples=...('sts-dev.csv'),`), so
# the line is kept byte-identical rather than reformatted — the remainder of
# the dev setup lives outside this chunk.
from datetime import datetime #### Just some code to print debug information to stdout logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S', level=logging.INFO, handlers=[LoggingHandler()]) #### /print debug information to stdout # Read the dataset model_name = 'bert-base-nli-mean-tokens' train_batch_size = 16 num_epochs = 4 model_save_path = 'output/training_stsbenchmark_continue_training-' + model_name + '-' + datetime.now( ).strftime("%Y-%m-%d_%H-%M-%S") sts_reader = STSBenchmarkDataReader( '/checkpoint/xiaojianwu/data/sentBERT/stsbenchmark', normalize_scores=True) # Load a pre-trained sentence transformer model model = SentenceTransformer(model_name) # Convert the dataset to a DataLoader ready for training logging.info("Read STSbenchmark train dataset") train_dataset = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) train_loss = losses.CosineSimilarityLoss(model=model) logging.info("Read STSbenchmark dev dataset") dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
# NOTE(review): collapsed fragment of a layer-pair pooling sweep. Visible
# logic: switch the corpus to its "white/"-suffixed variant, sweep all
# (layer_i, layer_j) pairs over 13 layers (12 transformer layers + the
# embedding layer), build a Layer2Pooling SentenceTransformer per pair,
# evaluate each task group with Whitening/plain EmbeddingSimilarityEvaluator
# (COSINE main similarity) via a SequentialEvaluator whose overall score is
# the mean, and log per-task results scaled by 100. The exact nesting of the
# trailing statements (`_all_results = []` through the final logger.info
# calls) relative to the i/j loops cannot be recovered from the stripped
# newlines, so the line is kept byte-identical rather than reformatted —
# re-indenting it would risk moving statements into the wrong loop body.
# `results = {i: [] for i in range(total_layers)}` is assigned but never used
# in the visible code — presumably consumed later; verify before removing.
args.sts_corpus += "white/" target_eval_files = [f+"-white" for f in target_eval_files] word_embedding_model = models.Transformer(args.encoder_name, model_args={'output_hidden_states': True, 'batch_size': args.batch_size}) total_layers = 12 + 1 results = {i: [] for i in range(total_layers)} for i in range(total_layers): for j in range(total_layers): logger.info("Pool:{}, Encoder:{}, Whitening:{}, L:{}, L:{}".format(args.pooling, args.encoder_name, args.whitening, i, j)) pooling_model = Layer2Pooling(args.pooling, word_embedding_model.get_word_embedding_dimension(), layer_i=i, layer_j=j) model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) evaluators = {task: [] for task in target_eval_tasks} # evaluators has a list of different evaluator classes we call periodically sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, args.sts_corpus)) for idx, target in enumerate(target_eval_files): output_filename_eval = os.path.join(script_folder_path, args.sts_corpus + target + "-test.csv") if args.whitening: evaluators[target[:5]].append(WhiteningEmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples(output_filename_eval), measure_data_num=target_eval_data_num[idx], name=target, main_similarity=SimilarityFunction.COSINE)) else: evaluators[target[:5]].append(EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples(output_filename_eval), name=target, main_similarity=SimilarityFunction.COSINE)) _all_results = [] logger_text = "" for task, sequential_evaluator in evaluators.items(): result = model.evaluate(SequentialEvaluator(sequential_evaluator, main_score_function=lambda scores: np.mean(scores))) logger_text += "%.2f \t" % (result * 100) _all_results.append(result * 100) logger.info(" \t".join(target_eval_tasks) + " \tOverall.") logger.info(logger_text + "%.2f"%np.mean(_all_results))
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader
import logging
import sys
import os
import torch

script_folder_path = os.path.dirname(os.path.realpath(__file__))

# Limit torch to 4 threads
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = sys.argv[1] if len(sys.argv) > 1 else 'paraphrase-distilroberta-base-v1'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark'))
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples("sts-test.csv"), name='sts-test')
model.evaluate(evaluator)
# Limit torch to 4 threads, as this example runs on the CPU.
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# 1) Point the transformer model to the BERT / RoBERTa etc. model you would
#    like to use. Ensure that output_hidden_states is true.
word_embedding_model = models.Transformer('bert-base-uncased',
                                          model_args={'output_hidden_states': True})

# 2) Add WKPooling
pooling_model = models.WKPooling(word_embedding_model.get_word_embedding_dimension())

# 3) Create a sentence transformer model to glue both models together.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark')
test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)
model.evaluate(evaluator)
from sentence_transformers.readers import STSBenchmarkDataReader
import logging
import sys
import os
import torch

script_folder_path = os.path.dirname(os.path.realpath(__file__))

# Limit torch to 4 threads
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark'))
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples("sts-test.csv"))
model.evaluate(evaluator)
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# You can specify any huggingface/transformers pre-trained model here,
# for example, bert-base-uncased, roberta-base, xlm-roberta-base.
model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'

# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_' + model_name.replace("/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark', normalize_scores=True)

# Use a Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R)
# for mapping tokens to embeddings.
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training.
logging.info("Read STSbenchmark train dataset")
train_dataset = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)