Example #1
def main(args):
	# Read the dataset
	args.output_dir = os.path.join(args.output_dir, 
									f'training_nli_sts-sci_bert-num_epochs_{args.num_epochs}-bs_{args.batch_size}-lr_{args.learning_rate}')
	logger.info(f"the output directory is {args.output_dir}.")

	if avoid_duplicate(args):
		logger.info(f"the experiment is previously done.")
		sys.exit(0)

	if args.should_continue:
		sorted_checkpoints_list = sorted_checkpoints(args)
		if len(sorted_checkpoints_list) == 0:
			raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
		elif len(sorted_checkpoints_list) == 1:
			raise ValueError("Only one checkpoint was found; when resuming on ephemeral storage, the second-to-last checkpoint is expected to be loaded.")
		else:
			args.sentence_model_path = sorted_checkpoints_list[-2]
		# Load the pre-trained sentence transformer model;
		# args.sentence_model_path also stores the optimizer and scheduler states.
		model = SentenceTransformer_tb(saved_scibert_model_path=args.sentence_model_path)
	else:
		model = SentenceTransformer_tb(saved_scibert_model_path=args.finetuned_model_path)

	sts_reader = STSBenchmarkDataReader(args.data_directory, normalize_scores=True)

	# Convert the dataset to a DataLoader ready for training and dev.
	train_data, train_loss, train_dataloader, train_evaluator = train_config(sts_reader, model, args.batch_size)
	dev_loss, dev_dataloader, dev_evaluator = dev_config(sts_reader, model, args.batch_size)

	# Configure the training
	warmup_steps = math.ceil(len(train_data)*args.num_epochs/args.batch_size*0.1) #10% of train data for warm-up
	optimizer_params = {'lr': args.learning_rate, 'eps': 1e-6, 'correct_bias': False}
	logger.info(f"Warmup-steps: {warmup_steps}")

	# Train the model
	model.fit(args=args,
			train_objectives=[(train_dataloader, train_loss)],
			eval_objectives=[(dev_dataloader, dev_loss)],
			train_evaluator=train_evaluator,
			evaluator=dev_evaluator,
			train_phase='STS',
			epochs=args.num_epochs,
			evaluation_steps=50 if not args.debug_mode else 2,
			optimizer_params=optimizer_params,
			warmup_steps=warmup_steps)
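
The train_config and dev_config helpers called above are not shown in this snippet; below is a hypothetical sketch of what train_config might return, assuming the SentencesDataset, DataLoader, losses, and EmbeddingSimilarityEvaluator classes used in the other examples and an 'sts-train.csv' split name.

def train_config(sts_reader, model, batch_size):
	# Hypothetical helper: build the training dataset, loss, dataloader, and evaluator.
	train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
	train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
	train_loss = losses.CosineSimilarityLoss(model=model)
	train_evaluator = EmbeddingSimilarityEvaluator(train_dataloader, name='sts-train')
	return train_data, train_loss, train_dataloader, train_evaluator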
Example #2
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'

# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark',
                                    normalize_scores=True)

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
Example #3
import sys

#print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

model_name = 'bert-base-uncased'

# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_reader = STSBenchmarkDataReader('Data/stsbenchmark', normalize_scores=True)

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
Example #4
    '-continue_training-' + model_name + '-' +
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

# The score range differs by task
if args.task_name == 'semeval':
    max_score, min_score = 2, 0
elif args.task_name == 'askubuntu' or args.task_name == 'quora':
    max_score, min_score = 1, 0
else:
    raise NotImplementedError
print("max min score", max_score, min_score)

sts_reader = STSBenchmarkDataReader(data_path_folder,
                                    normalize_scores=True,
                                    s1_col_idx=0,
                                    s2_col_idx=1,
                                    score_col_idx=2,
                                    delimiter="\t",
                                    min_score=min_score,
                                    max_score=max_score)
# Load a pre-trained sentence transformer model

# Convert the dataset to a DataLoader ready for training
logging.info("Read question similarity train dataset")
train_dataset = SentencesDataset(sts_reader.get_examples(train_name), model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read question similarity dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples(dev_name),
                            model=model)
Example #5
parser.add_argument('--vncorenlp_path',
                    type=str,
                    default="./VnCoreNLP/VnCoreNLP-1.1.1.jar")
parser.add_argument('--bpe_path', type=str, default="./PhoBERT")
args = parser.parse_args()

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout
if not os.path.exists(args.ckpt_path):
    os.mkdir(args.ckpt_path)

# Read the dataset
sts_reader = STSBenchmarkDataReader(args.data_path, normalize_scores=True)

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.PhoBERT(args.pre_trained_path,
                                      tokenizer_args={
                                          'vncorenlp_path':
                                          args.vncorenlp_path,
                                          'bpe_path': args.bpe_path
                                      })

cnn = models.CNN(
    in_word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
    out_channels=256,
    kernel_sizes=[1, 3, 5, 5, 3, 1])
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
Example #6
# Alternatively, you can also pass a filepath to SentenceTransformer()
pretrained = True
if pretrained:
    model = SentenceTransformer(model_name)
else:
    word_embedding_model = models.Transformer('bert-base-uncased')
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark'))
sts_reader = STSBenchmarkDataReader(data_folder,
                                    s1_col_idx=0,
                                    s2_col_idx=1,
                                    score_col_idx=2,
                                    delimiter="\t",
                                    min_score=0,
                                    max_score=1)

test_data = SentencesDataset(
    examples=sts_reader.get_examples("test_sts.tsv"),
    model=model,
)
print("DataLoader")
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
print("EmbeddingSimilarityEvaluator")
evaluator = EmbeddingSimilarityEvaluator(test_dataloader,
                                         show_progress_bar=False)

print(evaluator)
Example #7
    pooling_model = LayerNPooling(
        args.pooling,
        word_embedding_model.get_word_embedding_dimension(),
        layers=layer_index)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

logger.info("Pool:{}, Encoder:{}, Whitening:{}".format(args.pooling,
                                                       args.encoder_name,
                                                       args.whitening))

evaluators = {
    task: []
    for task in target_eval_tasks
}  #evaluators has a list of different evaluator classes we call periodically
sts_reader = STSBenchmarkDataReader(
    os.path.join(script_folder_path, args.sts_corpus))
for idx, target in enumerate(target_eval_files):
    output_filename_eval = os.path.join(script_folder_path,
                                        args.sts_corpus + target + "-test.csv")
    if args.whitening:
        evaluators[target[:5]].append(
            WhiteningEmbeddingSimilarityEvaluator.from_input_examples(
                sts_reader.get_examples(output_filename_eval),
                measure_data_num=target_eval_data_num[idx],
                embed_dim=args.embed_dim,
                name=target,
                main_similarity=SimilarityFunction.COSINE))
    else:
        evaluators[target[:5]].append(
            EmbeddingSimilarityEvaluator.from_input_examples(
                sts_reader.get_examples(output_filename_eval),
Example #8
script_folder_path = os.path.dirname(os.path.realpath(__file__))

#Limit torch to 4 threads
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = '../training/nli/output/training_nli_bert-base-uncased-2021-01-10_14-44-13'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

sts_corpus = "../datasets/stsbenchmark/" 
target_eval_files = set(['sts','sts12', 'sts13', 'sts14', 'sts15', 'sts16', 'sick-r']) 

evaluators = []         #evaluators has a list of different evaluator classes we call periodically
sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, sts_corpus))
for target in target_eval_files:
	output_filename_eval = os.path.join(script_folder_path,sts_corpus + target + "-test.csv")
	evaluators.append(EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples(output_filename_eval), name=target))

evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores))
model.evaluate(evaluator)
Example #9
with open(my_loc + '/proc_data/val-pairs.csv', 'w', encoding='utf-8') as f:
    for prs in val_pos_pairs:
        f.write('%s\t%s\t1\n' % (prs[0], prs[1]))

    # for prs in val_neg_pairs:
    # f.write('%s\t%s\t0\n'%(prs[0], prs[1]))

num_epochs = 4
batch_size = 8
model_save_path = my_loc + '/models/finetune_claim_title_%s' % (fname)

sts_reader = STSBenchmarkDataReader(my_loc + '/proc_data/',
                                    s1_col_idx=0,
                                    s2_col_idx=1,
                                    score_col_idx=2,
                                    normalize_scores=False,
                                    min_score=0,
                                    max_score=1)

model = SentenceTransformer(emb_type)
train_data = SentencesDataset(sts_reader.get_examples('train-pairs.csv'),
                              model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.MultipleNegativesRankingLoss(model=model)

dev_data = SentencesDataset(examples=sts_reader.get_examples('val-pairs.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)
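
The snippet stops after building the dev evaluator; a minimal sketch of the fit call that would typically follow, reusing the objects defined above (the 10% warm-up heuristic and the math import are assumptions).

# Assumed continuation: fine-tune and save to model_save_path.
warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)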
Example #10
from sentence_transformers.readers import STSBenchmarkDataReader
import logging
import torch

#Limit torch to 4 threads, as this example runs on the CPU
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#1) Point the transformer model to the BERT / RoBERTa etc. model you would like to use. Ensure that output_hidden_states is true
word_embedding_model = models.Transformer(
    'bert-base-uncased', model_args={'output_hidden_states': True})

#2) Add WKPooling
pooling_model = models.WKPooling(
    word_embedding_model.get_word_embedding_dimension())

#3) Create a sentence transformer model to glue both models together
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark')
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples("sts-test.csv"))

model.evaluate(evaluator)
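
As a quick, assumed usage check (not part of the original snippet), the assembled WKPooling model can also embed raw sentences directly:

sentences = ["A man is playing a guitar.", "Someone plays an acoustic guitar."]
embeddings = model.encode(sentences)  # one vector per input sentence
print(len(embeddings), len(embeddings[0]))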
Example #11
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
pretrained = False
if pretrained:
    model = SentenceTransformer(model_name)
else:
    word_embedding_model = models.Transformer('bert-base-uncased')
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark'))
sts_reader = STSBenchmarkDataReader(
    os.path.join(script_folder_path, 'stsbenchmark'))

test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"),
                             model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

model.evaluate(evaluator)
Example #12
from sentence_transformers import SentenceTransformer,  SentencesDataset, LoggingHandler, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader
import logging
import torch

#Limit torch to 4 threads, as this example runs on the CPU
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout


#1) Point the transformer model to the BERT / RoBERTa etc. model you would like to use. Ensure that output_hidden_states is true
word_embedding_model = models.Transformer('bert-base-uncased', model_args={'output_hidden_states': True})

#2) Add WKPooling
pooling_model = models.WKPooling(word_embedding_model.get_word_embedding_dimension())

#3) Create a sentence transformer model to glue both models together
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark')
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples("sts-test.csv"))

model.evaluate(evaluator)
Example #13
from datetime import datetime

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Read the dataset
model_name = 'bert-base-nli-mean-tokens'
train_batch_size = 16
num_epochs = 4
model_save_path = ('output/training_stsbenchmark_continue_training-' + model_name +
                   '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
sts_reader = STSBenchmarkDataReader(
    '/checkpoint/xiaojianwu/data/sentBERT/stsbenchmark', normalize_scores=True)

# Load a pre-trained sentence transformer model
model = SentenceTransformer(model_name)

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_dataset = SentencesDataset(sts_reader.get_examples('sts-train.csv'),
                                 model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                            model=model)
Example #14
    args.sts_corpus += "white/"
    target_eval_files = [f+"-white" for f in target_eval_files]

word_embedding_model = models.Transformer(args.encoder_name, model_args={'output_hidden_states': True, 'batch_size': args.batch_size})

total_layers = 12 + 1
results = {i: [] for i in range(total_layers)}
for i in range(total_layers):
    for j in range(total_layers):
        logger.info("Pool:{}, Encoder:{}, Whitening:{}, L:{}, L:{}".format(args.pooling, args.encoder_name, args.whitening, i, j))
        pooling_model = Layer2Pooling(args.pooling, word_embedding_model.get_word_embedding_dimension(), layer_i=i, layer_j=j)

        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

        evaluators = {task: [] for task in target_eval_tasks}  # evaluators has a list of different evaluator classes we call periodically
        sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, args.sts_corpus))
        for idx, target in enumerate(target_eval_files):
            output_filename_eval = os.path.join(script_folder_path, args.sts_corpus + target + "-test.csv")
            if args.whitening:
                evaluators[target[:5]].append(WhiteningEmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples(output_filename_eval), measure_data_num=target_eval_data_num[idx], name=target, main_similarity=SimilarityFunction.COSINE))
            else:
                evaluators[target[:5]].append(EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples(output_filename_eval), name=target, main_similarity=SimilarityFunction.COSINE))

        _all_results = []
        logger_text = ""
        for task, sequential_evaluator in evaluators.items():
            result = model.evaluate(SequentialEvaluator(sequential_evaluator, main_score_function=lambda scores: np.mean(scores)))
            logger_text += "%.2f \t" % (result * 100)
            _all_results.append(result * 100)
        logger.info(" \t".join(target_eval_tasks) + " \tOverall.")
        logger.info(logger_text + "%.2f"%np.mean(_all_results))
Example #15
from sentence_transformers import SentenceTransformer,  LoggingHandler
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader
import logging
import sys
import os
import torch

script_folder_path = os.path.dirname(os.path.realpath(__file__))

#Limit torch to 4 threads
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = sys.argv[1] if len(sys.argv) > 1 else 'paraphrase-distilroberta-base-v1'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark'))
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples("sts-test.csv"), name='sts-test')

model.evaluate(evaluator)
Example #16
#Limit torch to 4 threads, as this example runs on the CPU
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#1) Point the transformer model to the BERT / RoBERTa etc. model you would like to use. Ensure that output_hidden_states is true
word_embedding_model = models.Transformer(
    'bert-base-uncased', model_args={'output_hidden_states': True})

#2) Add WKPooling
pooling_model = models.WKPooling(
    word_embedding_model.get_word_embedding_dimension())

#3) Create a sentence transformer model to glue both models together
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark')

test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"),
                             model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

model.evaluate(evaluator)
Example #17
from sentence_transformers.readers import STSBenchmarkDataReader
import logging
import sys
import os
import torch

script_folder_path = os.path.dirname(os.path.realpath(__file__))

#Limit torch to 4 threads
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

sts_reader = STSBenchmarkDataReader(
    os.path.join(script_folder_path, '../datasets/stsbenchmark'))
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples("sts-test.csv"))

model.evaluate(evaluator)
Example #18
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = sys.argv[1] if len(sys.argv) > 1 else  'bert-base-uncased'

# Read the dataset
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark', normalize_scores=True)

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_dataset = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)