import logging
from datetime import datetime

import torch
import sentence_transformers as st
from sentence_transformers.evaluation import TripletEvaluator

# parse_args, get_model, get_triplet_dataset and get_data_loader are project
# helpers defined elsewhere.


def main():
    args = parse_args()
    dataset_path = 'examples/datasets/iambot-wikipedia-sections-triplets-all'
    output_path = ('output/bert-base-wikipedia-sections-mean-tokens-'
                   + datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    batch_size = 17
    num_epochs = 1

    is_distributed = torch.cuda.device_count() > 1 and args.local_rank >= 0
    if is_distributed:
        torch.distributed.init_process_group(backend='nccl')

    model = get_model(local_rank=args.local_rank)

    logging.info('Read Triplet train dataset')
    train_data = get_triplet_dataset(dataset_path, 'train.csv', model)
    train_dataloader = get_data_loader(dataset=train_data,
                                       shuffle=True,
                                       batch_size=batch_size,
                                       distributed=is_distributed)

    logging.info('Read Wikipedia Triplet dev dataset')
    dev_dataloader = get_data_loader(
        dataset=get_triplet_dataset(dataset_path, 'validation.csv', model, 1000),
        shuffle=False,
        batch_size=batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    warmup_steps = int(len(train_data) * num_epochs / batch_size * 0.1)
    loss = st.losses.TripletLoss(model=model)
    model.fit(train_objectives=[(train_dataloader, loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=output_path,
              local_rank=args.local_rank)

    # Only the main process (or a single-GPU run) evaluates on the test set.
    if args.local_rank == 0 or not is_distributed:
        del model
        torch.cuda.empty_cache()
        model = st.SentenceTransformer(output_path)
        test_data = get_triplet_dataset(dataset_path, 'test.csv', model)
        test_dataloader = get_data_loader(test_data,
                                          shuffle=False,
                                          batch_size=batch_size)
        evaluator = TripletEvaluator(test_dataloader)
        model.evaluate(evaluator)
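# The snippet above relies on a get_data_loader() helper that is not shown.
# Below is a minimal sketch of what it might look like, inferred from the
# call sites; the exact signature is an assumption. In the distributed case
# a DistributedSampler takes over shuffling, so shuffle is not passed to the
# DataLoader itself.
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def get_data_loader(dataset, shuffle, batch_size, distributed=False):
    if distributed:
        # Each process draws a distinct shard; the sampler shuffles per rank.
        sampler = DistributedSampler(dataset, shuffle=shuffle)
        return DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return DataLoader(dataset, shuffle=shuffle, batch_size=batch_size)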
def main(model_path, model_type, extra_dataset):
    # Read the dataset
    train_batch_size = 64
    num_epochs = 20
    triplet_margin = 5  # margin for TripletLoss (library default)
    model_save_path = (model_path + '_continue_training_'
                       + datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
    n2c2_reader = TripletReader(extra_dataset)

    #### Load sentence BERT models and generate sentence embeddings ####
    if model_type.lower() in ["bert"]:
        word_embedding_model = models.BERT(model_path)
        # Apply mean pooling to get one fixed-sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        embedder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        embedder = SentenceTransformer(model_path)

    # Load a pre-trained sentence transformer model
    model = SentenceTransformer(model_path)

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read extra training dataset")
    train_data = SentencesDataset(n2c2_reader.get_examples('train.tsv'), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model, triplet_margin=triplet_margin)

    logging.info("Read development dataset")
    dev_data = SentencesDataset(examples=n2c2_reader.get_examples('dev.tsv'),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    # Configure the training
    warmup_steps = math.ceil(
        len(train_data) * num_epochs / train_batch_size * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model, evaluating once per epoch
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=math.ceil(len(train_data) / train_batch_size),
              warmup_steps=warmup_steps,
              output_path=model_save_path)
def __get_triplet_evaluator(tsv_path: Path, name: str):
    anchors = []
    positives = []
    negatives = []
    with open(str(tsv_path), mode="r", encoding="UTF-8", newline="") as f:
        for row in csv.reader(f, delimiter="\t"):
            anchors.append(row[0])
            positives.append(row[1])
            negatives.append(row[2])
    return TripletEvaluator(anchors, positives, negatives, name=name)
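# A minimal usage sketch for the helper above; the file path and model name
# are hypothetical. The returned evaluator scores a model and reports the
# fraction of triplets ranked correctly.
from pathlib import Path
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')  # any SentenceTransformer works
evaluator = __get_triplet_evaluator(Path('data/dev_triplets.tsv'), name='dev')
accuracy = evaluator(model)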
def train_contrastive_model(self, slang_ind, params=None, fold_name='default'):
    if params is None:
        params = {
            'train_batch_size': 16,
            'num_epochs': 4,
            'triplet_margin': 1,
            'outpath': 'SBERT_contrastive'
        }

    self.prep_contrastive_training(slang_ind, fold_name=fold_name)

    out_dir = self.out_dir + '/' + fold_name + '/SBERT_data/'
    triplet_reader = TripletReader(out_dir,
                                   s1_col_idx=0,
                                   s2_col_idx=1,
                                   s3_col_idx=2,
                                   delimiter=',',
                                   has_header=True)
    output_path = out_dir + params['outpath']

    sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

    train_data = SentencesDataset(
        examples=triplet_reader.get_examples('contrastive_train.csv'),
        model=sbert_model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=params['train_batch_size'])
    train_loss = losses.TripletLoss(model=sbert_model,
                                    triplet_margin=params['triplet_margin'])

    dev_data = SentencesDataset(
        examples=triplet_reader.get_examples('contrastive_dev.csv'),
        model=sbert_model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=params['train_batch_size'])
    evaluator = TripletEvaluator(dev_dataloader)

    warmup_steps = int(len(train_data) * params['num_epochs'] /
                       params['train_batch_size'] * 0.1)  # 10% of train data

    # Train the model
    sbert_model.fit(train_objectives=[(train_dataloader, train_loss)],
                    evaluator=evaluator,
                    epochs=params['num_epochs'],
                    evaluation_steps=len(dev_data),
                    warmup_steps=warmup_steps,
                    output_path=output_path)
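# TripletReader above expects a comma-separated file with a header row and
# one (anchor, positive, negative) triplet per line. A toy writer showing
# the assumed layout; the example rows are made up.
import csv

with open('contrastive_train.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['anchor', 'positive', 'negative'])
    writer.writerow(['that party was lit',       # anchor: slang usage
                     'that party was exciting',  # positive: same sense
                     'the candle was lit'])      # negative: literal sense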
def test(conf: "TrainConfig"):
    logger = logging.getLogger(__name__)
    logger.info("Start test")

    model = SentenceTransformer(str(conf.model_dir))
    model.tokenizer = AutoTokenizer.from_pretrained(conf.transformer_model)
    logger.info(f"model: {type(model)}")
    logger.info(f"tokenizer: {type(model.tokenizer)}")

    # Sanity-check the tokenizer on a Japanese sentence
    # ("the problem of Japanese tokenization").
    encode_result = model.tokenizer(["日本語のトークナイゼーションの問題"],
                                    return_tensors='pt',
                                    padding=True)
    logger.info(model.tokenizer.convert_ids_to_tokens(
        encode_result.input_ids.flatten().tolist()))

    triplet_reader = TripletReader(str(conf.train_triplets_tsv.parent))
    evaluator = TripletEvaluator.from_input_examples(
        triplet_reader.get_examples(conf.test_triplets_tsv.name), name="test")
    evaluator(model, output_path=str(conf.model_dir))
def train(conf: "TrainConfig"):
    logger = logging.getLogger(__name__)
    logger.info("Initialize model")

    transformer = models.Transformer(conf.transformer_model)
    pooling = models.Pooling(
        transformer.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
    )
    model = SentenceTransformer(modules=[transformer, pooling])
    model.tokenizer = AutoTokenizer.from_pretrained(conf.transformer_model)
    logger.info(f"model: {type(model)}")
    logger.info(f"tokenizer: {type(model.tokenizer)}")

    encode_result = model.tokenizer(["日本語のトークナイゼーションの問題"],
                                    return_tensors='pt',
                                    padding=True)
    logger.info(model.tokenizer.convert_ids_to_tokens(
        encode_result.input_ids.flatten().tolist()))

    logger.info("Read training data")
    triplet_reader = TripletReader(str(conf.train_triplets_tsv.parent))
    train_data = SentencesDataset(
        triplet_reader.get_examples(conf.train_triplets_tsv.name), model=model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=conf.batch_size)
    train_loss = TripletLoss(
        model=model,
        distance_metric=TripletDistanceMetric.EUCLIDEAN,
        triplet_margin=1,
    )
    evaluator = TripletEvaluator.from_input_examples(
        triplet_reader.get_examples(conf.dev_triplets_tsv.name), name="dev")

    logger.info("Start training")
    warmup_steps = int(len(train_data) // conf.batch_size * 0.1)
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,
        epochs=conf.epochs,
        evaluation_steps=conf.eval_steps,
        warmup_steps=warmup_steps,
        output_path=str(conf.model_dir),
    )
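# Both test() and train() above assume a TrainConfig object carrying paths
# and hyperparameters. A minimal sketch of such a config: the field names
# come from the code above, but every default value is an assumption.
from dataclasses import dataclass
from pathlib import Path

@dataclass
class TrainConfig:
    transformer_model: str = 'cl-tohoku/bert-base-japanese-whole-word-masking'
    train_triplets_tsv: Path = Path('data/train_triplets.tsv')
    dev_triplets_tsv: Path = Path('data/dev_triplets.tsv')
    test_triplets_tsv: Path = Path('data/test_triplets.tsv')
    model_dir: Path = Path('output/model')
    batch_size: int = 16
    epochs: int = 1
    eval_steps: int = 1000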
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)

### Triplet losses ####################
### There are 3 triplet loss variants:
### - BatchHardTripletLoss
### - BatchHardSoftMarginTripletLoss
### - BatchSemiHardTripletLoss
#######################################
# train_loss = losses.BatchHardTripletLoss(sentence_embedder=model)
# train_loss = losses.BatchHardSoftMarginTripletLoss(sentence_embedder=model)
train_loss = losses.BatchSemiHardTripletLoss(sentence_embedder=model)

logging.info("Read TREC val dataset")
dev_evaluator = TripletEvaluator.from_input_examples(dev_set, name='dev')

logging.info("Performance before fine-tuning:")
dev_evaluator(model)

warmup_steps = int(len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=output_path,
)
### Triplet losses ####################
### There are 3 triplet loss variants:
### - BatchHardTripletLoss
### - BatchHardSoftMarginTripletLoss
### - BatchSemiHardTripletLoss
#######################################
# train_loss = losses.BatchHardTripletLoss(sentence_embedder=model)
# train_loss = losses.BatchHardSoftMarginTripletLoss(sentence_embedder=model)
train_loss = losses.BatchSemiHardTripletLoss(sentence_embedder=model)

logging.info("Read TREC val dataset")
dataset_dev = SentenceLabelDataset(examples=val, model=model)
dev_dataloader = DataLoader(dataset_dev,
                            shuffle=False,
                            batch_size=train_batch_size)
evaluator = TripletEvaluator(dev_dataloader)

warmup_steps = int(len(train_dataloader.dataset) * num_epochs /
                   train_batch_size * 0.1)  # 10% of train data

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=output_path,
)
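# The Batch*TripletLoss variants mine triplets inside each batch, so they
# consume labeled single sentences rather than precomputed triplets. A sketch
# of the kind of input SentenceLabelDataset expects; sentences and labels
# are made up.
from sentence_transformers import InputExample

train_examples = [
    InputExample(texts=['How far away is the moon?'], label=0),
    InputExample(texts=['What is the distance to the moon?'], label=0),
    InputExample(texts=['Who wrote Hamlet?'], label=1),
    InputExample(texts=['Which author wrote Hamlet?'], label=1),
]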
import math
import os

import torch
import transformers
from torch.utils.data import DataLoader, RandomSampler
from scipy.spatial.distance import cdist
from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import TripletEvaluator

torch.cuda.empty_cache()

my_model_path = '/run/media/root/Windows/Users/agnes/Downloads/data/msmarco/train_results/test_wiki'
model_wiki = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')

dev_dataloader = torch.load(os.path.join(my_model_path, 'dev_dataloader.pth'))
train_dataloader = torch.load(
    os.path.join(my_model_path, 'train_dataloader.pth'))
evaluator = TripletEvaluator(dev_dataloader)

optimizer_class = transformers.AdamW
optimizer_params = {'lr': 2e-4, 'eps': 1e-6, 'correct_bias': False}
train_loss = losses.TripletLoss(model=model_wiki)

num_epochs = 4
warmup_steps = math.ceil(len(train_dataloader.dataset) * num_epochs /
                         train_dataloader.batch_size * 0.05)  # 5% of train data for warm-up

model_wiki.fit(train_objectives=[(train_dataloader, train_loss)],
               evaluator=evaluator,
               epochs=num_epochs,
               steps_per_epoch=8000,
               warmup_steps=warmup_steps,
               optimizer_class=optimizer_class,
               optimizer_params=optimizer_params)
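# cdist is imported above but unused in this excerpt; presumably it serves a
# retrieval step after training. A minimal sketch of ranking candidates by
# cosine distance with the fine-tuned model; the sentences are made up.
query_emb = model_wiki.encode(['example query'])
corpus_emb = model_wiki.encode(['first candidate', 'second candidate'])
distances = cdist(query_emb, corpus_emb, metric='cosine')  # lower = closer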
import csv
import logging
from datetime import datetime

from torch.utils.data import DataLoader
from sentence_transformers import (LoggingHandler, SentencesDataset,
                                   SentenceTransformer, losses, models)
from sentence_transformers.evaluation import TripletEvaluator
from sentence_transformers.readers import TripletReader


def train(triplet_data_dir, output):
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    ### Create a torch.DataLoader that passes training batch instances to our model
    train_batch_size = 16
    triplet_reader = TripletReader(triplet_data_dir,
                                   s1_col_idx=1,
                                   s2_col_idx=2,
                                   s3_col_idx=3,
                                   delimiter=',',
                                   quoting=csv.QUOTE_MINIMAL,
                                   has_header=True)
    # output_path = "output/bert-base-wikipedia-sections-mean-tokens-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_path = output + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    num_epochs = 1

    ### Configure sentence transformers for training and train on the provided dataset
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT('bert-base-uncased')
    # Apply mean pooling to get one fixed-sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    logging.info("Read Triplet train dataset")
    train_data = SentencesDataset(
        examples=triplet_reader.get_examples('train.csv', 2000000),
        model=model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model)

    logging.info("Read Wikipedia Triplet dev dataset")
    dev_data = SentencesDataset(
        examples=triplet_reader.get_examples('validation.csv', 10000),
        model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    warmup_steps = int(len(train_data) * num_epochs / train_batch_size * 0.1)  # 10% of train data

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=output_path)

    ##########################################################################
    #
    # Load the stored model and evaluate its performance on the triplet test set
    #
    ##########################################################################

    model = SentenceTransformer(output_path)
    test_data = SentencesDataset(
        examples=triplet_reader.get_examples('test.csv'), model=model)
    test_dataloader = DataLoader(test_data,
                                 shuffle=False,
                                 batch_size=train_batch_size)
    evaluator = TripletEvaluator(test_dataloader)
    model.evaluate(evaluator)
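# For reference, what TripletEvaluator measures, written out by hand: the
# fraction of triplets whose anchor embedding lies closer to the positive
# than to the negative. A sketch using euclidean distance, not the library
# implementation (which also reports cosine and manhattan variants).
import numpy as np

def triplet_accuracy(anchors, positives, negatives, model):
    a = model.encode(anchors)
    p = model.encode(positives)
    n = model.encode(negatives)
    d_ap = np.linalg.norm(a - p, axis=1)  # anchor-positive distances
    d_an = np.linalg.norm(a - n, axis=1)  # anchor-negative distances
    return float(np.mean(d_ap < d_an))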
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

logging.info("Read Triplet train dataset")
train_dataset = SentencesDataset(
    examples=triplet_reader.get_examples('train.csv', max_examples=100000),
    model=model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model)

logging.info("Read Wikipedia Triplet dev dataset")
evaluator = TripletEvaluator.from_input_examples(
    triplet_reader.get_examples('validation.csv', 1000), name='dev')

warmup_steps = int(len(train_dataset) * num_epochs / train_batch_size * 0.1)  # 10% of train data

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=output_path)