Example No. 1
def main():
    args = parse_args()

    dataset_path = 'examples/datasets/iambot-wikipedia-sections-triplets-all'

    output_path = 'output/bert-base-wikipedia-sections-mean-tokens-' + \
                  datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

    batch_size = 17
    num_epochs = 1

    is_distributed = torch.cuda.device_count() > 1 and args.local_rank >= 0

    if is_distributed:
        torch.distributed.init_process_group(backend='nccl')

    model = get_model(local_rank=args.local_rank)

    logging.info('Read Triplet train dataset')
    train_data = get_triplet_dataset(dataset_path, 'train.csv', model)
    train_dataloader = get_data_loader(dataset=train_data,
                                       shuffle=True,
                                       batch_size=batch_size,
                                       distributed=is_distributed)

    logging.info('Read Wikipedia Triplet dev dataset')
    dev_dataloader = get_data_loader(dataset=get_triplet_dataset(
        dataset_path, 'validation.csv', model, 1000),
                                     shuffle=False,
                                     batch_size=batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    warmup_steps = int(len(train_data) * num_epochs / batch_size * 0.1)

    loss = st.losses.TripletLoss(model=model)

    model.fit(train_objectives=[(train_dataloader, loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=output_path,
              local_rank=args.local_rank)

    if args.local_rank == 0 or not is_distributed:
        del model
        torch.cuda.empty_cache()

        model = st.SentenceTransformer(output_path)
        test_data = get_triplet_dataset(dataset_path, 'test.csv', model)
        test_dataloader = get_data_loader(test_data,
                                          shuffle=False,
                                          batch_size=batch_size)
        evaluator = TripletEvaluator(test_dataloader)

        model.evaluate(evaluator)
Example No. 2
def main(model_path, model_type, extra_dataset):
    # Read the dataset
    train_batch_size = 64
    num_epochs = 20
    triplet_margin = 5  # margin passed to TripletLoss below (5 is the sentence-transformers default)
    model_save_path = model_path + '_continue_training_' + \
        datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    n2c2_reader = TripletReader(extra_dataset)

    if model_type.lower() in ["bert"]:
        word_embedding_model = models.BERT(model_path)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        embedder = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        #### load sentence BERT models and generate sentence embeddings ####
    else:
        #### load sentence BERT models and generate sentence embeddings ####
        embedder = SentenceTransformer(model_path)

    # Load a pre-trained sentence transformer model
    model = SentenceTransformer(model_path)

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read extra training dataset")
    train_data = SentencesDataset(n2c2_reader.get_examples('train.tsv'), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model, triplet_margin=triplet_margin)

    logging.info("Read development dataset")
    dev_data = SentencesDataset(examples=n2c2_reader.get_examples('dev.tsv'),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    # Configure the training. We skip evaluation in this example
    warmup_steps = math.ceil(
        len(train_data) * num_epochs / train_batch_size *
        0.1)  #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=math.ceil(len(train_data) / train_batch_size),
              warmup_steps=warmup_steps,
              output_path=model_save_path)
Example No. 3
def __get_triplet_evaluator(tsv_path: Path, name: str):
    anchors = []
    positives = []
    negatives = []
    with open(str(tsv_path), mode="r", encoding="UTF-8", newline="") as f:
        for row in csv.reader(f, delimiter="\t"):
            anchors.append(row[0])
            positives.append(row[1])
            negatives.append(row[2])
    return TripletEvaluator(anchors, positives, negatives, name=name)
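The helper above builds a TripletEvaluator directly from a tab-separated file of (anchor, positive, negative) rows. A minimal usage sketch from the same module; the model name and file path are illustrative assumptions, not taken from the original:

from pathlib import Path

from sentence_transformers import SentenceTransformer

# Hypothetical model and path, purely for illustration.
model = SentenceTransformer("bert-base-nli-mean-tokens")
dev_evaluator = __get_triplet_evaluator(Path("data/dev_triplets.tsv"), name="dev")
score = dev_evaluator(model)  # returns triplet accuracy: how often the anchor is closer to the positive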
Example No. 4
    def train_contrastive_model(self,
                                slang_ind,
                                params=None,
                                fold_name='default'):

        if params is None:
            params = {
                'train_batch_size': 16,
                'num_epochs': 4,
                'triplet_margin': 1,
                'outpath': 'SBERT_contrastive'
            }

        self.prep_contrastive_training(slang_ind, fold_name=fold_name)

        out_dir = self.out_dir + '/' + fold_name + '/SBERT_data/'

        triplet_reader = TripletReader(out_dir,
                                       s1_col_idx=0,
                                       s2_col_idx=1,
                                       s3_col_idx=2,
                                       delimiter=',',
                                       has_header=True)
        output_path = out_dir + params['outpath']

        sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

        train_data = SentencesDataset(
            examples=triplet_reader.get_examples('contrastive_train.csv'),
            model=sbert_model)
        train_dataloader = DataLoader(train_data,
                                      shuffle=True,
                                      batch_size=params['train_batch_size'])
        train_loss = losses.TripletLoss(
            model=sbert_model, triplet_margin=params['triplet_margin'])

        dev_data = SentencesDataset(
            examples=triplet_reader.get_examples('contrastive_dev.csv'),
            model=sbert_model)
        dev_dataloader = DataLoader(dev_data,
                                    shuffle=False,
                                    batch_size=params['train_batch_size'])
        evaluator = TripletEvaluator(dev_dataloader)

        warmup_steps = int(
            len(train_data) * params['num_epochs'] /
            params['train_batch_size'] * 0.1)  #10% of train data

        # Train the model
        sbert_model.fit(train_objectives=[(train_dataloader, train_loss)],
                        evaluator=evaluator,
                        epochs=params['num_epochs'],
                        evaluation_steps=len(dev_data),
                        warmup_steps=warmup_steps,
                        output_path=output_path)
Example No. 5
def test(conf: "TrainConfig"):
    logger = logging.getLogger(__name__)
    logger.info("Start test")

    model = SentenceTransformer(str(conf.model_dir))
    model.tokenizer = AutoTokenizer.from_pretrained(conf.transformer_model)
    logger.info(f"model: {type(model)}")
    logger.info(f"tokenizer: {type(model.tokenizer)}")

    encode_result = model.tokenizer(["日本語のトークナイゼーションの問題"], return_tensors='pt', padding=True)
    logger.info(model.tokenizer.convert_ids_to_tokens(encode_result.input_ids.flatten().tolist()))

    triplet_reader = TripletReader(str(conf.train_triplets_tsv.parent))
    evaluator = TripletEvaluator.from_input_examples(
        triplet_reader.get_examples(conf.test_triplets_tsv.name), name="test"
    )
    evaluator(model, output_path=str(conf.model_dir))
Example No. 6
def train(conf: "TrainConfig"):
    logger = logging.getLogger(__name__)
    logger.info("Initialize model")
    transformer = models.Transformer(conf.transformer_model)

    pooling = models.Pooling(
        transformer.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
    )

    model = SentenceTransformer(modules=[transformer, pooling])
    model.tokenizer = AutoTokenizer.from_pretrained(conf.transformer_model)
    logger.info(f"model: {type(model)}")
    logger.info(f"tokenizer: {type(model.tokenizer)}")
    encode_result = model.tokenizer(["日本語のトークナイゼーションの問題"], return_tensors='pt', padding=True)
    logger.info(model.tokenizer.convert_ids_to_tokens(encode_result.input_ids.flatten().tolist()))

    logger.info("Read training data")
    triplet_reader = TripletReader(str(conf.train_triplets_tsv.parent))
    train_data = SentencesDataset(
        triplet_reader.get_examples(conf.train_triplets_tsv.name), model=model
    )
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=conf.batch_size)
    train_loss = TripletLoss(
        model=model, distance_metric=TripletDistanceMetric.EUCLIDEAN, triplet_margin=1
    )

    evaluator = TripletEvaluator.from_input_examples(
        triplet_reader.get_examples(conf.dev_triplets_tsv.name), name="dev"
    )

    logger.info("Start training")
    warmup_steps = int(len(train_data) // conf.batch_size * 0.1)
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=evaluator,
        epochs=conf.epochs,
        evaluation_steps=conf.eval_steps,
        warmup_steps=warmup_steps,
        output_path=str(conf.model_dir),
    )
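The two functions above only reference TrainConfig by name; the class itself is not shown. A sketch of a dataclass with the fields they touch might look like the following, where the field names come from the code above and the default values are illustrative assumptions:

from dataclasses import dataclass
from pathlib import Path


@dataclass
class TrainConfig:
    # Default values below are assumptions for illustration only.
    transformer_model: str = "cl-tohoku/bert-base-japanese-whole-word-masking"
    train_triplets_tsv: Path = Path("data/train_triplets.tsv")
    dev_triplets_tsv: Path = Path("data/dev_triplets.tsv")
    test_triplets_tsv: Path = Path("data/test_triplets.tsv")
    model_dir: Path = Path("output/triplet-model")
    batch_size: int = 16
    epochs: int = 1
    eval_steps: int = 1000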
Example No. 7
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)

### Triplet losses ####################
### There are 3 triplet loss variants:
### - BatchHardTripletLoss
### - BatchHardSoftMarginTripletLoss
### - BatchSemiHardTripletLoss
#######################################

#train_loss = losses.BatchHardTripletLoss(sentence_embedder=model)
#train_loss = losses.BatchHardSoftMarginTripletLoss(sentence_embedder=model)
train_loss = losses.BatchSemiHardTripletLoss(sentence_embedder=model)
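# Note: unlike losses.TripletLoss in the earlier examples, these Batch*TripletLoss
# variants mine triplets within each batch from class labels, so every training
# item is an InputExample(texts=["..."], label=<int>) and the data is typically
# wrapped in a SentenceLabelDataset (as in the next example) so that each batch
# contains several sentences per label.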

logging.info("Read TREC val dataset")
dev_evaluator = TripletEvaluator.from_input_examples(dev_set, name='dev')

logging.info("Performance before fine-tuning:")
dev_evaluator(model)

warmup_steps = int(len(train_dataset) * num_epochs / train_batch_size *
                   0.1)  # 10% of train data

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=output_path,
)
Example No. 8
### Triplet losses ####################
### There are 3 triplet loss variants:
### - BatchHardTripletLoss
### - BatchHardSoftMarginTripletLoss
### - BatchSemiHardTripletLoss
#######################################

#train_loss = losses.BatchHardTripletLoss(sentence_embedder=model)
#train_loss = losses.BatchHardSoftMarginTripletLoss(sentence_embedder=model)
train_loss = losses.BatchSemiHardTripletLoss(sentence_embedder=model)

logging.info("Read TREC val dataset")
dataset_dev = SentenceLabelDataset(examples=val, model=model)
dev_dataloader = DataLoader(dataset_dev,
                            shuffle=False,
                            batch_size=train_batch_size)
evaluator = TripletEvaluator(dev_dataloader)

warmup_steps = int(
    len(train_dataloader) * num_epochs / train_batch_size *
    0.1)  # 10% of train data

# Train the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=output_path,
)
Example No. 9
import math
import os

import torch
import transformers
from torch.utils.data import DataLoader, RandomSampler

from scipy.spatial.distance import cdist

from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.evaluation import TripletEvaluator

torch.cuda.empty_cache()

my_model_path = '/run/media/root/Windows/Users/agnes/Downloads/data/msmarco/train_results/test_wiki'

model_wiki = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens')

dev_dataloader = torch.load(os.path.join(my_model_path, 'dev_dataloader.pth'))
train_dataloader = torch.load(
    os.path.join(my_model_path, 'train_dataloader.pth'))

evaluator = TripletEvaluator(dev_dataloader)

optimizer_class = transformers.AdamW
optimizer_params = {'lr': 2e-4, 'eps': 1e-6, 'correct_bias': False}
train_loss = losses.TripletLoss(model=model_wiki)

num_epochs = 4
warmup_steps = math.ceil(
    len(train_dataloader.dataset) * num_epochs / train_dataloader.batch_size *
    0.05)  #5% of train data for warm-up

model_wiki.fit(train_objectives=[(train_dataloader, train_loss)],
               evaluator=evaluator,
               epochs=num_epochs,
               steps_per_epoch=8000,
               warmup_steps=warmup_steps)
Example No. 10
def train(triplet_data_dir, output):
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    ### Create a torch.DataLoader that passes training batch instances to our model
    train_batch_size = 16
    triplet_reader = TripletReader(triplet_data_dir,
                                   s1_col_idx=1,
                                   s2_col_idx=2,
                                   s3_col_idx=3,
                                   delimiter=',',
                                   quoting=csv.QUOTE_MINIMAL,
                                   has_header=True)
    # output_path = "output/bert-base-wikipedia-sections-mean-tokens-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    output_path = output + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    num_epochs = 1

    ### Configure sentence transformers for training and train on the provided dataset
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT('bert-base-uncased')

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    logging.info("Read Triplet train dataset")
    train_data = SentencesDataset(examples=triplet_reader.get_examples(
        'train.csv', 2000000),
                                  model=model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.TripletLoss(model=model)

    logging.info("Read Wikipedia Triplet dev dataset")
    dev_data = SentencesDataset(examples=triplet_reader.get_examples(
        'validation.csv', 10000),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = TripletEvaluator(dev_dataloader)

    warmup_steps = int(len(train_data) * num_epochs / train_batch_size *
                       0.1)  #10% of train data

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=output_path)

    ##############################################################################
    #
    # Load the stored model and evaluate its performance on STS benchmark dataset
    #
    ##############################################################################

    model = SentenceTransformer(output_path)
    test_data = SentencesDataset(
        examples=triplet_reader.get_examples('test.csv'), model=model)
    test_dataloader = DataLoader(test_data,
                                 shuffle=False,
                                 batch_size=train_batch_size)
    evaluator = TripletEvaluator(test_dataloader)

    model.evaluate(evaluator)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

logging.info("Read Triplet train dataset")
train_dataset = SentencesDataset(examples=triplet_reader.get_examples(
    'train.csv', max_examples=100000),
                                 model=model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model)

logging.info("Read Wikipedia Triplet dev dataset")
evaluator = TripletEvaluator.from_input_examples(triplet_reader.get_examples(
    'validation.csv', 1000),
                                                 name='dev')

warmup_steps = int(len(train_dataset) * num_epochs / train_batch_size *
                   0.1)  #10% of train data

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=output_path)

##############################################################################
#