def cli_main():
    """Fine-tune a multilingual sentence encoder with MultipleNegativesRankingLoss.

    The function takes no arguments and returns nothing. It

    1. loads the ``distilbert-multilingual-nli-stsb-quora-ranking`` checkpoint,
    2. creates a timestamped output directory below ``cur_dir``,
    3. builds the training set from ``LCCC-large.json`` via ``get_data``,
    4. builds an information-retrieval evaluator from ``STC.json`` via
       ``get_iq_corpus``,
    5. runs the evaluator once on the untrained model, and
    6. trains for 10 epochs, writing results to the output directory.

    Both data files are read from a hard-coded directory under
    ``/content/drive/My Drive``.
    """

    def _last_score(scores):
        # The main score is the score of the last evaluator in the sequence.
        return scores[-1]

    epochs = 10
    batch_size = 64
    data_root = "/content/drive/My Drive/data/nlp"

    # Multilingual checkpoint mentioned by the author in the issues:
    #   xlm-r-40langs-bert-base-nli-stsb-mean-tokens
    # Multilingual checkpoint aimed at information-retrieval tasks:
    #   distilbert-multilingual-nli-stsb-quora-ranking
    encoder = SentenceTransformer(
        'distilbert-multilingual-nli-stsb-quora-ranking')

    run_dir = os.path.join(
        cur_dir,
        'output/training_MultipleNegativesRankingLoss-' +
        datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    os.makedirs(run_dir, exist_ok=True)

    # Training data: samples -> SentencesDataset -> shuffled DataLoader.
    samples = get_data(os.path.join(data_root, "LCCC-large.json"))
    dataset = SentencesDataset(samples, model=encoder)
    loader = DataLoader(dataset, shuffle=True, batch_size=batch_size)
    objective = losses.MultipleNegativesRankingLoss(encoder)

    # Information-retrieval evaluation built from STC.json; the two limits
    # are forwarded unchanged to get_iq_corpus.
    max_queries = 5000
    max_corpus = 100000
    queries, corpus, relevant_docs = get_iq_corpus(
        os.path.join(data_root, "STC.json"), max_queries, max_corpus)
    retrieval_evaluator = evaluation.InformationRetrievalEvaluator(
        queries, corpus, relevant_docs)
    evaluator = evaluation.SequentialEvaluator(
        [retrieval_evaluator], main_score_function=_last_score)

    # Baseline evaluation before any training step has run.
    logging.info("Evaluate model without training")
    evaluator(encoder, epoch=0, steps=0, output_path=run_dir)

    # Train the model
    encoder.fit(
        train_objectives=[(loader, objective)],
        evaluator=evaluator,
        epochs=epochs,
        warmup_steps=1000,
        output_path=run_dir,
        output_path_ignore_not_empty=True,
    )
# Add shuffled distraction questions to the corpus: at most
# (max_corpus_size - len(ir_corpus)) of them, and none at all when the corpus
# has already reached that size.
other_qid_list = list(distraction_questions.keys())
random.shuffle(other_qid_list)
num_fillers = max(0, max_corpus_size - len(ir_corpus))
for qid in other_qid_list[:num_fillers]:
    ir_corpus[qid] = distraction_questions[qid]

# Given queries, a corpus and a mapping with relevant documents, the
# InformationRetrievalEvaluator computes different IR metrics. For our use
# case MRR@k and Accuracy@k are relevant.
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    ir_queries,
    ir_corpus,
    ir_relevant_docs,
)
evaluators.append(ir_evaluator)

# The SequentialEvaluator runs every evaluator collected in `evaluators`, in
# order. The model is optimized with respect to the score of the last one
# (scores[-1]).
seq_evaluator = evaluation.SequentialEvaluator(
    evaluators,
    main_score_function=lambda scores: scores[-1],
)

# Baseline evaluation before any training step has run.
logging.info("Evaluate model without training")
seq_evaluator(model, epoch=0, steps=0, output_path=model_save_path)

# Train the model on both objectives: MultipleNegativesRankingLoss and the
# contrastive loss, each paired with its own dataloader.
train_objectives = [
    (train_dataloader_MultipleNegativesRankingLoss,
     train_loss_MultipleNegativesRankingLoss),
    (train_dataloader_ConstrativeLoss,
     train_loss_ConstrativeLoss),
]
model.fit(
    train_objectives=train_objectives,
    evaluator=seq_evaluator,
    epochs=num_epochs,
    warmup_steps=1000,
    output_path=model_save_path,
    output_path_ignore_not_empty=True,
)
# NOTE(review): this fragment is collapsed onto one physical line. Its first
# statements read `line` and `filename`, which are not bound anywhere in the
# visible text -- presumably they come from an enclosing loop whose header is
# outside this view (TODO confirm). The nesting cannot be reconstructed from
# here, so the code is left untouched and only described.
#
# Steps, in the order they appear:
#   1. Split `line` on tabs into two sentences and a score, convert the score
#      to float, and append the three values to the 'sentences1',
#      'sentences2' and 'scores' lists of sts_data[filename].
#   2. For every (filename, data) entry of sts_data, build an
#      EmbeddingSimilarityEvaluator from those three lists, named after the
#      file, with batch_size=inference_batch_size and the progress bar
#      disabled, and append it to `evaluators`.
#   3. Call student_model.fit with a SequentialEvaluator over `evaluators`
#      whose main score is np.mean of the individual scores, with
#      save_best_model=True and explicit optimizer parameters (lr 2e-5,
#      eps 1e-6, correct_bias False).
#
# NOTE(review): as laid out, everything after the inline "# Train the model"
# marker sits on the same physical line and is therefore part of that comment
# to the Python tokenizer; restore the original line breaks before running.
sent1, sent2, score = line.strip().split("\t") score = float(score) sts_data[filename]['sentences1'].append(sent1) sts_data[filename]['sentences2'].append(sent2) sts_data[filename]['scores'].append(score) for filename, data in sts_data.items(): test_evaluator = evaluation.EmbeddingSimilarityEvaluator( data['sentences1'], data['sentences2'], data['scores'], batch_size=inference_batch_size, name=filename, show_progress_bar=False) evaluators.append(test_evaluator) # Train the model student_model.fit(train_objectives=[(train_dataloader, train_loss)], evaluator=evaluation.SequentialEvaluator( evaluators, main_score_function=lambda scores: np.mean(scores)), epochs=num_epochs, warmup_steps=num_warmup_steps, evaluation_steps=num_evaluation_steps, output_path=output_path, save_best_model=True, optimizer_params={ 'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False })
# Log the teacher's embedding size and run the STS dev evaluator on the
# teacher model.
teacher_dim = teacher_model.get_sentence_embedding_dimension()
logging.info("Teacher Performance with {} dimensions:".format(teacher_dim))
dev_evaluator_sts(teacher_model)

# We train the student_model such that it creates sentence embeddings similar
# to the embeddings from the teacher_model. For this, we need a large set of
# sentences. These sentences are embedded using the teacher model, and the
# student tries to mimic these embeddings. It is the same approach as used in:
# https://arxiv.org/abs/2004.09813
train_data = ParallelSentencesDataset(
    student_model=student_model,
    teacher_model=teacher_model,
    batch_size=inference_batch_size,
    use_embedding_cache=False,
)
# Each sentence is wrapped in a one-element list before being added; the NLI
# sentences are added first, then the Wikipedia sentences.
for sentence_source in (train_sentences_nli, train_sentences_wikipedia):
    train_data.add_dataset(
        [[sent] for sent in sentence_source],
        max_sentence_length=256,
    )

train_dataloader = DataLoader(
    train_data,
    shuffle=True,
    batch_size=train_batch_size,
)
train_loss = losses.MSELoss(model=student_model)

# We create an evaluator that measures the Mean Squared Error (MSE) between
# the teacher and the student embeddings. The same sentence list is passed as
# both the first and the second argument.
dev_sentences = dev_sentences_nli + dev_sentences_wikipedia
dev_evaluator_mse = evaluation.MSEEvaluator(
    dev_sentences,
    dev_sentences,
    teacher_model=teacher_model,
)

# Train the student model to imitate the teacher. The STS evaluator runs
# before the MSE evaluator.
distillation_evaluator = evaluation.SequentialEvaluator(
    [dev_evaluator_sts, dev_evaluator_mse])
distillation_optimizer_params = {
    'lr': 1e-4,
    'eps': 1e-6,
    'correct_bias': False,
}
student_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=distillation_evaluator,
    epochs=1,
    warmup_steps=1000,
    evaluation_steps=5000,
    output_path=output_path,
    save_best_model=True,
    optimizer_params=distillation_optimizer_params,
    use_amp=True,
)