import csv
import gzip
import os

from sentence_transformers import InputExample, SentenceTransformer, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# `model` is assumed to be an already-trained SentenceTransformer; the model
# name below is an illustrative choice, not from the original snippet
model = SentenceTransformer('bert-base-nli-mean-tokens')

sts_dataset_path = 'data/stsbenchmark.tsv.gz'
if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
        inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
        if row['split'] == 'dev':
            dev_samples.append(inp_example)
        elif row['split'] == 'test':
            test_samples.append(inp_example)
        else:
            train_samples.append(inp_example)

evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')
model.evaluate(evaluator)

evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='sts-test')
model.evaluate(evaluator)
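# A quick sanity check on the evaluated model: encode an illustrative sentence
# pair and compute its cosine similarity (the sentences are assumptions):
embeddings = model.encode(['A man is eating food.', 'A man is eating a piece of bread.'],
                          convert_to_tensor=True)
print(util.pytorch_cos_sim(embeddings[0], embeddings[1]))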
# Mean pooling over token embeddings (dimension taken from the word embedding model)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples('sts-dev.csv'), name='sts-dev')

# Configure the training; the model is evaluated on the dev set every 1000 steps
warmup_steps = math.ceil(len(train_data) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
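# A minimal sketch of the names the snippet above assumes (the base model,
# dataset folder, and hyperparameters are illustrative assumptions):
import logging
import math

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentencesDataset, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSDataReader

word_embedding_model = models.Transformer('bert-base-uncased')
sts_reader = STSDataReader('datasets/stsbenchmark')  # folder with sts-train.csv / sts-dev.csv
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark'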
def test(model_save_path, sts_dataset_path, train_batch_size):
    # `read_dataset` is a project-level helper that returns the InputExamples
    # of the requested split
    test_samples = read_dataset(sts_dataset_path, "test")
    model = SentenceTransformer(model_save_path)
    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        test_samples, batch_size=train_batch_size, name='sts-test')
    test_evaluator(model, output_path=model_save_path)
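# Hypothetical invocation of the helper above (both paths are assumptions):
test('output/training_stsbenchmark', 'data/stsbenchmark.tsv.gz', train_batch_size=16)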
dan1 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)
dan2 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)
model = SentenceTransformer(
    modules=[word_embedding_model, word_weights, pooling_model, dan1, dan2])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples('sts-dev.csv'))

# Configure the training
num_epochs = 10
warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
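# A minimal sketch of the word-embedding front end the snippet above assumes
# (the embedding file, the toy vocabulary, and the uniform weights are all
# illustrative assumptions, not from the original):
word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz')

vocab = ['the', 'a', 'and']  # in practice, use the embedding model's full vocabulary
word_weights = models.WordWeights(vocab=vocab,
                                  word_weights={},          # empty mapping: every word
                                  unknown_word_weight=1.0)  # falls back to weight 1.0

pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension()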
# Convert the dataset to a DataLoader ready for training; `csts_reader` is a
# project-level reader for the Chinese STS-B (CNSD) files
logging.info("Read CSTS-B train dataset")
train_data = SentencesDataset(csts_reader.get_examples('cnsd-sts-train.txt'), model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read CSTS-B dev dataset")
dev_data = SentencesDataset(examples=csts_reader.get_examples('cnsd-sts-dev.txt'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training; the model is evaluated on the dev set every 100 steps
warmup_steps = math.ceil(len(train_data) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=100,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
# The slice for the train split is an assumption: everything before the first
# index in `train_dev_test_split`
train_data = SentencesDataset(all_examples[:train_dev_test_split[0]],
                              model, show_progress_bar=True)
pickle.dump(train_data, open("train_data.pkl", "wb"))
# train_data = pickle.load(open("train_data.pkl", "rb"))
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)

dev_data = SentencesDataset(all_examples[train_dev_test_split[0]:train_dev_test_split[1]],
                            model, show_progress_bar=True)
# Evaluate on a random subsample of 2000 dev examples to keep evaluation cheap
dev_sampler = RandomSampler(dev_data, replacement=True, num_samples=2000)
dev_loader = DataLoader(dev_data, batch_size=16, sampler=dev_sampler)

train_loss = losses.CosineSimilarityLoss(model=model)
evaluator = EmbeddingSimilarityEvaluator(dev_loader, show_progress_bar=True, device=device)

model.fit(train_objectives=[(train_loader, train_loss)],
          evaluator=evaluator,
          epochs=10,
          evaluation_steps=2000,
          warmup_steps=int(217206 / 5),
          output_path="train_sampled_eval4",
          optimizer_params={'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False})
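# A minimal sketch of the names the snippet above assumes (the model choice,
# toy examples, and split indices are illustrative assumptions):
import pickle

import torch
from torch.utils.data import DataLoader, RandomSampler
from sentence_transformers import InputExample, SentenceTransformer, SentencesDataset, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentenceTransformer('bert-base-nli-mean-tokens')
all_examples = [InputExample(texts=['sentence A', 'sentence B'], label=0.8)] * 100  # toy data
train_dev_test_split = (80, 90)  # boundary indices into all_examples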
train_loss = losses.MultipleNegativesRankingLoss(model)

# Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'dev':
            score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
            dev_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']], label=score))

dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples, batch_size=train_batch_size, name='sts-dev')

# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=int(len(train_dataloader) * 0.1),
          warmup_steps=warmup_steps,
          output_path=model_save_path,
          use_amp=False)  # Set use_amp=True if your GPU supports FP16 operations
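# A minimal sketch of the names the snippet above assumes (the model choice,
# paths, hyperparameters, and pair data are illustrative assumptions):
import csv
import gzip
import logging
import math

from torch.utils.data import DataLoader
from sentence_transformers import InputExample, SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

model = SentenceTransformer('distilbert-base-nli-mean-tokens')
sts_dataset_path = 'data/stsbenchmark.tsv.gz'
model_save_path = 'output/training_mnrl'
train_batch_size = 64
num_epochs = 1

# MultipleNegativesRankingLoss trains on positive pairs; the other examples in
# a batch act as negatives. model.fit() attaches its smart-batching collate
# function to this DataLoader.
train_samples = [
    InputExample(texts=['A plane is taking off.', 'An air plane is taking off.']),
    InputExample(texts=['A man is playing a flute.', 'A man plays a flute.']),
]
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)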
import math
import os

import torch
import transformers
from torch.utils.data import DataLoader, RandomSampler
from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import (BinaryEmbeddingSimilarityEvaluator,
                                              EmbeddingSimilarityEvaluator,
                                              SequentialEvaluator)

torch.cuda.empty_cache()

my_model_path = 'msmarco/models/test_model5'
model_1 = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')
dev_dataloader = torch.load(os.path.join(my_model_path, 'dev_dataloader.pth'))
train_dataloader = torch.load(os.path.join(my_model_path, 'train_dataloader.pth'))

# Run both evaluators each round; the binary evaluator's score decides the best model
evaluator1 = BinaryEmbeddingSimilarityEvaluator(dev_dataloader)
evaluator2 = EmbeddingSimilarityEvaluator(dev_dataloader)
evaluator = SequentialEvaluator([evaluator1, evaluator2],
                                main_score_function=lambda scores: scores[0])

optimizer_class = transformers.AdamW
optimizer_params = {'lr': 2e-6, 'eps': 1e-6, 'correct_bias': False}
train_loss = losses.CosineSimilarityLoss(model=model_1)
num_epochs = 100
warmup_steps = math.ceil(len(train_dataloader.dataset) * num_epochs / train_dataloader.batch_size * 0.1)  # 10% of train data for warm-up

model_1.fit(train_objectives=[(train_dataloader, train_loss)],
            evaluator=evaluator,
            epochs=num_epochs,
            steps_per_epoch=1000,
            warmup_steps=warmup_steps,
            optimizer_class=optimizer_class,
            optimizer_params=optimizer_params)