Example #1
# Imports needed by this example (helper functions such as get_data and get_iq_corpus,
# and the cur_dir path, are defined elsewhere in the original project):
import os
import logging
from datetime import datetime
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentencesDataset, losses, evaluation


def cli_main():
    # Multilingual pretrained model mentioned by the author in the issues: xlm-r-40langs-bert-base-nli-stsb-mean-tokens
    # Multilingual pretrained model for information retrieval tasks: distilbert-multilingual-nli-stsb-quora-ranking
    model = SentenceTransformer(
        'distilbert-multilingual-nli-stsb-quora-ranking')

    num_epochs = 10
    train_batch_size = 64
    model_save_path = os.path.join(
        cur_dir, 'output/training_MultipleNegativesRankingLoss-' +
        datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    os.makedirs(model_save_path, exist_ok=True)

    colab_dir = "/content/drive/My Drive/data/nlp"
    data_file = os.path.join(colab_dir, "LCCC-large.json")
    train_samples = get_data(data_file)

    # After reading the train_samples, we create a SentencesDataset and a DataLoader
    train_dataset = SentencesDataset(train_samples, model=model)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.MultipleNegativesRankingLoss(model)
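    # Added note: for each (query, duplicate) pair, MultipleNegativesRankingLoss treats all
    # other duplicates in the same batch as negatives, so larger batch sizes usually
    # provide a stronger training signal.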

    ###### Duplicate Questions Information Retrieval ######
    evaluators = []
    data_file = os.path.join(colab_dir, "STC.json")
    max_ir_num = 5000
    max_corpus_size = 100000
    ir_queries, ir_corpus, ir_relevant_docs = get_iq_corpus(
        data_file, max_ir_num, max_corpus_size)
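    # Added illustration (hypothetical toy values, not the real STC.json content):
    # get_iq_corpus is expected to return plain dicts of the shape the
    # InformationRetrievalEvaluator consumes:
    #   ir_queries       = {'q1': 'how can I reset my password?'}
    #   ir_corpus        = {'d1': 'steps for resetting a password', 'd2': 'unrelated text'}
    #   ir_relevant_docs = {'q1': {'d1'}}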

    ir_evaluator = evaluation.InformationRetrievalEvaluator(
        ir_queries, ir_corpus, ir_relevant_docs)
    evaluators.append(ir_evaluator)
    seq_evaluator = evaluation.SequentialEvaluator(
        evaluators, main_score_function=lambda scores: scores[-1])

    logging.info("Evaluate model without training")
    seq_evaluator(model, epoch=0, steps=0, output_path=model_save_path)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=seq_evaluator,
              epochs=num_epochs,
              warmup_steps=1000,
              output_path=model_save_path,
              output_path_ignore_not_empty=True)
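
    # Usage sketch (an addition, not part of the original example): load the saved model
    # and retrieve the closest corpus entries for each query with util.semantic_search.
    from sentence_transformers import util
    trained_model = SentenceTransformer(model_save_path)
    corpus_ids = list(ir_corpus.keys())
    corpus_emb = trained_model.encode([ir_corpus[cid] for cid in corpus_ids],
                                      convert_to_tensor=True)
    query_ids = list(ir_queries.keys())
    query_emb = trained_model.encode([ir_queries[qid] for qid in query_ids],
                                     convert_to_tensor=True)
    hits = util.semantic_search(query_emb, corpus_emb, top_k=5)
    for qid, query_hits in zip(query_ids, hits):
        best = query_hits[0]
        logging.info("Query %s -> best match %s (score %.3f)",
                     qid, corpus_ids[best['corpus_id']], best['score'])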
Example #2
other_qid_list = list(distraction_questions.keys())
random.shuffle(other_qid_list)

for qid in other_qid_list[0:max(0, max_corpus_size - len(ir_corpus))]:
    ir_corpus[qid] = distraction_questions[qid]

# Given queries, a corpus, and a mapping to the relevant documents, the InformationRetrievalEvaluator computes
# different IR metrics. For our use case, MRR@k and Accuracy@k are the relevant ones.
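# Added worked example: MRR@k averages 1/rank of the first relevant document per query.
# If three queries find their first relevant document at ranks 1, 3, and not at all
# within the top k, then MRR@k = (1/1 + 1/3 + 0) / 3 ≈ 0.44.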
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    ir_queries, ir_corpus, ir_relevant_docs)

evaluators.append(ir_evaluator)

# Create a SequentialEvaluator. It runs all the appended evaluators in sequential order.
# We optimize the model with respect to the score from the last evaluator (scores[-1]).
seq_evaluator = evaluation.SequentialEvaluator(
    evaluators, main_score_function=lambda scores: scores[-1])

logging.info("Evaluate model without training")
seq_evaluator(model, epoch=0, steps=0, output_path=model_save_path)

# Train the model
model.fit(train_objectives=[(train_dataloader_MultipleNegativesRankingLoss,
                             train_loss_MultipleNegativesRankingLoss),
                            (train_dataloader_ConstrativeLoss,
                             train_loss_ConstrativeLoss)],
          evaluator=seq_evaluator,
          epochs=num_epochs,
          warmup_steps=1000,
          output_path=model_save_path,
          output_path_ignore_not_empty=True)
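
# Added illustration (not part of the original example): the two train_objectives above
# consume differently formatted examples. MultipleNegativesRankingLoss only needs positive
# (duplicate) pairs, while ContrastiveLoss needs pairs with a 0/1 duplicate label.
from sentence_transformers import InputExample
_mnr_example = InputExample(texts=['How do I learn Python?', 'How can I learn Python?'])
_contrastive_example = InputExample(texts=['How do I learn Python?', 'What is Java?'], label=0)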
                    sent1, sent2, score = line.strip().split("\t")
                    score = float(score)
                    sts_data[filename]['sentences1'].append(sent1)
                    sts_data[filename]['sentences2'].append(sent2)
                    sts_data[filename]['scores'].append(score)

for filename, data in sts_data.items():
    test_evaluator = evaluation.EmbeddingSimilarityEvaluator(
        data['sentences1'],
        data['sentences2'],
        data['scores'],
        batch_size=inference_batch_size,
        name=filename,
        show_progress_bar=False)
    evaluators.append(test_evaluator)

# Train the model
student_model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluation.SequentialEvaluator(
                      evaluators,
                      main_score_function=lambda scores: np.mean(scores)),
                  epochs=num_epochs,
                  warmup_steps=num_warmup_steps,
                  evaluation_steps=num_evaluation_steps,
                  output_path=output_path,
                  save_best_model=True,
                  optimizer_params={
                      'lr': 2e-5,
                      'eps': 1e-6,
                      'correct_bias': False
                  })
    logging.info("Teacher Performance with {} dimensions:".format(teacher_model.get_sentence_embedding_dimension()))
    dev_evaluator_sts(teacher_model)



# We train the student_model such that it creates sentence embeddings similar to the embeddings from the teacher_model
# For this, we need a large set of sentences. These sentences are embedded using the teacher model,
# and the student tries to mimic these embeddings. It is the same approach as used in: https://arxiv.org/abs/2004.09813
train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=False)
train_data.add_dataset([[sent] for sent in train_sentences_nli], max_sentence_length=256)
train_data.add_dataset([[sent] for sent in train_sentences_wikipedia], max_sentence_length=256)
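# Added note: add_dataset expects one list of parallel sentences per entry (e.g. a source
# sentence plus its translations); here each entry contains a single sentence, so the
# student simply learns to reproduce the teacher's embedding of that same sentence.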

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)
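
# Added sketch (not part of the original code): conceptually, the MSE objective compares
# the student's embedding of a sentence to the teacher's embedding of the same sentence,
# assuming the two models share the same embedding dimension (required by MSELoss):
import numpy as np
_sample = ['A sentence used only to illustrate the distillation objective.']
_teacher_emb = teacher_model.encode(_sample)
_student_emb = student_model.encode(_sample)
logging.info("Illustrative MSE before training: %.6f",
             float(np.mean((_teacher_emb - _student_emb) ** 2)))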

# We create an evaluator that measures the Mean Squared Error (MSE) between the teacher and the student embeddings
dev_sentences = dev_sentences_nli + dev_sentences_wikipedia
dev_evaluator_mse = evaluation.MSEEvaluator(dev_sentences, dev_sentences, teacher_model=teacher_model)

# Train the student model to imitate the teacher
student_model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluation.SequentialEvaluator([dev_evaluator_sts, dev_evaluator_mse]),
                  epochs=1,
                  warmup_steps=1000,
                  evaluation_steps=5000,
                  output_path=output_path,
                  save_best_model=True,
                  optimizer_params={'lr': 1e-4, 'eps': 1e-6, 'correct_bias': False},
                  use_amp=True)
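
# Usage sketch (an addition, not part of the original example): after distillation, the
# student should embed a sentence close to where the teacher embeds it.
from sentence_transformers import util
_sentence = ['Distillation makes the student mimic the teacher embeddings.']
_teacher_emb = teacher_model.encode(_sentence, convert_to_tensor=True)
_student_emb = student_model.encode(_sentence, convert_to_tensor=True)
logging.info("Teacher/student cosine similarity: %.4f",
             util.pytorch_cos_sim(_teacher_emb, _student_emb).item())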