Example #1
0
# Evaluate a pretrained SentenceTransformer on the STS benchmark test split,
# then build a fresh Transformer + mean-pooling model and prepare STS
# train/dev DataLoaders for fine-tuning.
#
# Fixes: the original snippet referenced LoggingHandler, SentenceTransformer,
# EmbeddingSimilarityEvaluator, models, SentencesDataset, DataLoader and
# losses without importing them, and used train_batch_size without defining it.
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, models, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader
from torch.utils.data import DataLoader
import logging
import sys
import os
import torch

script_folder_path = os.path.dirname(os.path.realpath(__file__))

#Limit torch to 4 threads
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens'

# Batch size shared by the train and dev DataLoaders below
# (was previously referenced without ever being defined).
train_batch_size = 16

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

sts_reader = STSBenchmarkDataReader(
    os.path.join(script_folder_path, '../datasets/stsbenchmark'))
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples("sts-test.csv"))

model.evaluate(evaluator)

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_dataset = SentencesDataset(sts_reader.get_examples('sts-train.csv'),
                                 model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data,
                            shuffle=False,
                            batch_size=train_batch_size)
# NOTE(review): passing a DataLoader here is the pre-1.0 sentence-transformers
# API; newer versions construct this evaluator via from_input_examples(...).
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training. We skip evaluation in this example
Example #3
0
"""Score a pretrained SentenceTransformer against the STS benchmark test set."""
import logging
import os
import sys

import torch
from sentence_transformers import LoggingHandler, SentenceTransformer
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader

# Cap torch's intra-op parallelism at four threads.
torch.set_num_threads(4)

# Route timestamped log records to stdout.
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

# The model name (or a local filepath) may be overridden by the first CLI argument.
model_name = 'paraphrase-distilroberta-base-v1' if len(sys.argv) <= 1 else sys.argv[1]

# Downloads the named sentence model on first use.
model = SentenceTransformer(model_name)

script_folder_path = os.path.dirname(os.path.realpath(__file__))
reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark'))
test_examples = reader.get_examples("sts-test.csv")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_examples, name='sts-test')

model.evaluate(evaluator)
Example #4
0
# Evaluate a pretrained SentenceTransformer on the STS benchmark test split.
#
# Fixes: the original snippet used LoggingHandler, SentenceTransformer,
# EmbeddingSimilarityEvaluator and STSBenchmarkDataReader without importing them.
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader
import logging
import sys
import os
import torch

script_folder_path = os.path.dirname(os.path.realpath(__file__))

#Limit torch to 4 threads
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = sys.argv[1] if len(
    sys.argv) > 1 else 'paraphrase-distilroberta-base-v1'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

sts_reader = STSBenchmarkDataReader(
    os.path.join(script_folder_path, '../datasets/stsbenchmark'))
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples("sts-test.csv"), name='sts-test')

model.evaluate(evaluator)
Example #5
0
# NOTE(review): this appears to be two concatenated training-setup fragments
# (a plain Transformer+Pooling model, then a Transformer+CNN+Pooling model).
# It relies on names defined outside this view: model_name, sts_reader,
# train_batch_size, num_epochs, args, math, logging, and the
# sentence_transformers imports (models, SentenceTransformer, SentencesDataset,
# DataLoader, losses, EmbeddingSimilarityEvaluator) — confirm before running.
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

# Glue token embedder and pooling into one sentence encoder.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples('sts-dev.csv'), name='sts-dev')

# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(
    len(train_data) * num_epochs / train_batch_size *
    0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# Second fragment: insert a CNN layer between token embeddings and pooling.
cnn = models.CNN(in_word_embedding_dimension=word_embedding_model.
                 get_word_embedding_dimension(),
                 out_channels=256,
                 kernel_sizes=[1, 3, 5, 5, 3, 1])
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

# Rebinds `model` — the earlier Transformer+Pooling model is discarded.
model = SentenceTransformer(modules=[word_embedding_model, cnn, pooling_model])

# Convert the dataset to a DataLoader ready for training
# (the *_vi.csv files are presumably a Vietnamese STS variant — TODO confirm).
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train_vi.csv'),
                              model)
train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=args.batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev_vi.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data,
                            shuffle=False,
                            batch_size=args.batch_size)
# NOTE(review): DataLoader-based construction is the pre-1.0
# sentence-transformers evaluator API.
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training. We skip evaluation in this example
Example #7
0
    raise NotImplementedError
# NOTE(review): fragment — max_score, min_score, data_path_folder, model,
# train_name, dev_name, train_batch_size and all sentence_transformers names
# are defined outside this view; confirm before running.
print("max min score", max_score, min_score)

# Read tab-separated sentence pairs with a score column, normalized into
# the [min_score, max_score] range observed above.
sts_reader = STSBenchmarkDataReader(data_path_folder,
                                    normalize_scores=True,
                                    s1_col_idx=0,
                                    s2_col_idx=1,
                                    score_col_idx=2,
                                    delimiter="\t",
                                    min_score=min_score,
                                    max_score=max_score)
# Load a pre-trained sentence transformer model

# Convert the dataset to a DataLoader ready for training
logging.info("Read question similarity train dataset")
train_dataset = SentencesDataset(sts_reader.get_examples(train_name), model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read question similarity dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples(dev_name),
                            model=model)
dev_dataloader = DataLoader(dev_data,
                            shuffle=False,
                            batch_size=train_batch_size)
# Score dev pairs by cosine similarity of their embeddings
# (pre-1.0 sentence-transformers evaluator API).
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader,
                                         SimilarityFunction.COSINE)

# Configure the training. We skip evaluation in this example
                                                       args.encoder_name,
                                                       args.whitening))

# NOTE(review): fragment — target_eval_tasks, target_eval_files,
# target_eval_data_num, args, script_folder_path and the evaluator/reader
# classes (including the project-specific
# WhiteningEmbeddingSimilarityEvaluator) are defined outside this view.
# Builds one evaluator per test file, grouped by the first five characters
# of the target name (e.g. 'sts12' → 'sts12', 'sick-r' → 'sick-').
evaluators = {
    task: []
    for task in target_eval_tasks
}  #evaluators has a list of different evaluator classes we call periodically
sts_reader = STSBenchmarkDataReader(
    os.path.join(script_folder_path, args.sts_corpus))
for idx, target in enumerate(target_eval_files):
    # NOTE(review): an absolute path is passed to get_examples(); readers
    # usually join filenames onto their own folder — verify path handling.
    output_filename_eval = os.path.join(script_folder_path,
                                        args.sts_corpus + target + "-test.csv")
    if args.whitening:
        evaluators[target[:5]].append(
            WhiteningEmbeddingSimilarityEvaluator.from_input_examples(
                sts_reader.get_examples(output_filename_eval),
                measure_data_num=target_eval_data_num[idx],
                embed_dim=args.embed_dim,
                name=target,
                main_similarity=SimilarityFunction.COSINE))
    else:
        evaluators[target[:5]].append(
            EmbeddingSimilarityEvaluator.from_input_examples(
                sts_reader.get_examples(output_filename_eval),
                name=target,
                main_similarity=SimilarityFunction.COSINE))

# Accumulators for the per-task evaluation loop (continues past this view).
all_results = []
logger_text = ""
for task, sequential_evaluator in evaluators.items():
    result = model.evaluate(
Example #9
0
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# NOTE(review): fragment — data_folder, output_path and model are defined
# outside this view; confirm before running.
# sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark'))
# Read tab-separated pairs whose scores are already in [0, 1].
sts_reader = STSBenchmarkDataReader(data_folder,
                                    s1_col_idx=0,
                                    s2_col_idx=1,
                                    score_col_idx=2,
                                    delimiter="\t",
                                    min_score=0,
                                    max_score=1)

test_data = SentencesDataset(
    examples=sts_reader.get_examples("test_sts.tsv"),
    model=model,
)
print("DataLoader")
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
print("EmbeddingSimilarityEvaluator")
# Pre-1.0 sentence-transformers evaluator API (takes a DataLoader).
evaluator = EmbeddingSimilarityEvaluator(test_dataloader,
                                         show_progress_bar=False)

print(evaluator)
# print(model)
# print(model.evaluate)
# exit(1)

# Writes the similarity report (e.g. CSV) under output_path.
model.evaluate(evaluator, output_path)
# Build a BERT + WKPooling sentence encoder and evaluate it on the STS
# benchmark test split.
#
# Fixes: the original snippet used torch, logging, LoggingHandler, models,
# SentenceTransformer, SentencesDataset, DataLoader,
# EmbeddingSimilarityEvaluator and STSBenchmarkDataReader without importing them.
from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader
from torch.utils.data import DataLoader
import logging
import torch

#Limit torch to 4 threads, as this example runs on the CPU
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#1) Point the transformer model to the BERT / RoBERTa etc. model you would like to use. Ensure that output_hidden_states is true
word_embedding_model = models.Transformer(
    'bert-base-uncased', model_args={'output_hidden_states': True})

#2) Add WKPooling
pooling_model = models.WKPooling(
    word_embedding_model.get_word_embedding_dimension())

#3) Create a sentence transformer model to glue both models together
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark')

test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"),
                             model=model)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8)
# NOTE(review): DataLoader-based construction is the pre-1.0
# sentence-transformers evaluator API.
evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

model.evaluate(evaluator)
Example #11
0
# Benchmark sentence-encoding throughput on the STS benchmark train sentences.
#
# Fixes: the original snippet used torch, os, sys, time, logging,
# LoggingHandler, SentenceTransformer and STSBenchmarkDataReader without
# importing them.
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers.readers import STSBenchmarkDataReader
import logging
import os
import sys
import time
import torch

#Limit torch to 4 threads
torch.set_num_threads(4)

script_folder_path = os.path.dirname(os.path.realpath(__file__))


#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark'))
examples = sts_reader.get_examples("sts-train.csv")
# Flatten every example's sentence pair into one list of raw sentences.
sentences = [text for ex in examples for text in ex.texts]
print("Number of sentences:", len(sentences))

# Time a single batched encode pass and report sentences/second.
start_time = time.time()
emb = model.encode(sentences, batch_size=32)
end_time = time.time()
diff_time = end_time - start_time
print("Done after {:.2f} sec".format(diff_time))
print("Speed: {:.2f}".format(len(sentences) / diff_time))
Example #12
0
    # f.write('%s\t%s\t0\n'%(prs[0], prs[1]))

# NOTE(review): fragment — my_loc, fname, emb_type, math and all
# sentence_transformers names are defined outside this view; the model.fit()
# call that consumes these objects is truncated below this span.
num_epochs = 4
batch_size = 8
model_save_path = my_loc + '/models/finetune_claim_title_%s' % (fname)

# Read pre-processed claim/title pairs; scores are already in [0, 1],
# so normalization is disabled.
sts_reader = STSBenchmarkDataReader(my_loc + '/proc_data/',
                                    s1_col_idx=0,
                                    s2_col_idx=1,
                                    score_col_idx=2,
                                    normalize_scores=False,
                                    min_score=0,
                                    max_score=1)

model = SentenceTransformer(emb_type)
train_data = SentencesDataset(sts_reader.get_examples('train-pairs.csv'),
                              model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# Ranking loss: treats each pair as a positive among in-batch negatives.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

dev_data = SentencesDataset(examples=sts_reader.get_examples('val-pairs.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
# Pre-1.0 sentence-transformers evaluator API (takes a DataLoader).
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size *
                         0.1)  #10% of train data for warm-up
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
Example #13
0
# Evaluate a trained NLI model on several STS/SICK test sets via a
# SequentialEvaluator whose overall score is the mean of the per-set scores.
#
# Fixes: the original snippet was missing every import (os, torch, logging,
# numpy, LoggingHandler, SentenceTransformer, evaluator and reader classes),
# mixed tab and 4-space indentation in the for-loop, and carried trailing
# whitespace on two lines.
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader
import logging
import os
import numpy as np
import torch

script_folder_path = os.path.dirname(os.path.realpath(__file__))

#Limit torch to 4 threads
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = '../training/nli/output/training_nli_bert-base-uncased-2021-01-10_14-44-13'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

sts_corpus = "../datasets/stsbenchmark/"
target_eval_files = set(['sts', 'sts12', 'sts13', 'sts14', 'sts15', 'sts16', 'sick-r'])

evaluators = []  #evaluators has a list of different evaluator classes we call periodically
sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, sts_corpus))
for target in target_eval_files:
    # NOTE(review): an absolute path is passed to get_examples(); readers
    # usually join filenames onto their own folder — verify path handling.
    output_filename_eval = os.path.join(script_folder_path, sts_corpus + target + "-test.csv")
    evaluators.append(EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples(output_filename_eval), name=target))

evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores))
model.evaluate(evaluator)
# NOTE(review): fragment — word_embedding_model, args, logger,
# target_eval_tasks, target_eval_files, target_eval_data_num and the
# project-specific Layer2Pooling / WhiteningEmbeddingSimilarityEvaluator are
# defined outside this view; confirm before running.
# Sweep every (layer_i, layer_j) pair of a 12-layer encoder (+ embeddings
# layer, hence 13) and record the mean Spearman score per pairing.
total_layers = 12 + 1
results = {i: [] for i in range(total_layers)}
for i in range(total_layers):
    for j in range(total_layers):
        logger.info("Pool:{}, Encoder:{}, Whitening:{}, L:{}, L:{}".format(args.pooling, args.encoder_name, args.whitening, i, j))
        pooling_model = Layer2Pooling(args.pooling, word_embedding_model.get_word_embedding_dimension(), layer_i=i, layer_j=j)

        model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

        # One evaluator list per task group, keyed by the first five
        # characters of the target name.
        evaluators = {task: [] for task in target_eval_tasks}  # evaluators has a list of different evaluator classes we call periodically
        sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, args.sts_corpus))
        for idx, target in enumerate(target_eval_files):
            output_filename_eval = os.path.join(script_folder_path, args.sts_corpus + target + "-test.csv")
            if args.whitening:
                evaluators[target[:5]].append(WhiteningEmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples(output_filename_eval), measure_data_num=target_eval_data_num[idx], name=target, main_similarity=SimilarityFunction.COSINE))
            else:
                evaluators[target[:5]].append(EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples(output_filename_eval), name=target, main_similarity=SimilarityFunction.COSINE))

        # Evaluate each task group and log scores as percentages.
        _all_results = []
        logger_text = ""
        for task, sequential_evaluator in evaluators.items():
            result = model.evaluate(SequentialEvaluator(sequential_evaluator, main_score_function=lambda scores: np.mean(scores)))
            logger_text += "%.2f \t" % (result * 100)
            _all_results.append(result * 100)
        logger.info(" \t".join(target_eval_tasks) + " \tOverall.")
        logger.info(logger_text + "%.2f"%np.mean(_all_results))
        results[i].append(np.mean(_all_results))

logger.info("***** Finishing evaluation *****")
logger.info("********** Evaluation Results Spearman Cor. **********")
Example #15
0
# NOTE(review): fragment — model_name, train_batch_size, num_epochs, math,
# logging and the sentence_transformers names (models, SentenceTransformer,
# SentencesDataset, DataLoader, losses, EmbeddingSimilarityEvaluator) are
# defined outside this view; the model.fit() call is truncated below.
sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark', normalize_scores=True)

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_dataset = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)


logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples('sts-dev.csv'), name='sts-dev')


# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))


# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
Example #16
0
# NOTE(review): fragment — model_name, train_batch_size, logging,
# LoggingHandler and the sentence_transformers names are defined outside this
# view; confirm before running.
model_save_path = '../output/training_nli_sts_xlnet'
sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark',
                                    normalize_scores=True)

# Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

# Load a pre-trained sentence transformer model
model = SentenceTransformer(model_name)

# Load the training set
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
# smart_batching_collate groups examples into model-ready batches.
train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size,
                              collate_fn=model.smart_batching_collate)
train_loss = losses.CosineSimilarityLoss(model=model)

# Load the dev set
logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data,
                            shuffle=False,
                            batch_size=train_batch_size,
                            collate_fn=model.smart_batching_collate)
# Pre-1.0 sentence-transformers evaluator API (takes a DataLoader).
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)