Python SequentialEvaluator 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: sentence_transformers.evaluation

메소드/함수: SequentialEvaluator

hotexamples.com에서의 예제들: 4

Python SequentialEvaluator - 4개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python sentence_transformers.evaluation.SequentialEvaluator의 실제 사용 예제 중 평점이 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

def cli_main():
    """Fine-tune a sentence-embedding model with MultipleNegativesRankingLoss.

    Loads the pretrained ``distilbert-multilingual-nli-stsb-quora-ranking``
    model, reads training samples from ``LCCC-large.json`` and an
    information-retrieval evaluation set from ``STC.json``, evaluates the
    model once before any training, and then calls ``model.fit`` with the
    evaluator attached. Outputs go to a fresh timestamped directory under
    ``cur_dir``.
    """
    # Multilingual pretrained model the author mentioned in the issues:
    #   xlm-r-40langs-bert-base-nli-stsb-mean-tokens
    # Multilingual pretrained model aimed at information-retrieval tasks:
    #   distilbert-multilingual-nli-stsb-quora-ranking
    model = SentenceTransformer('distilbert-multilingual-nli-stsb-quora-ranking')

    epoch_count = 10
    batch_size = 64

    # One output directory per run, distinguished by its start timestamp.
    run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    save_dir = os.path.join(
        cur_dir, 'output/training_MultipleNegativesRankingLoss-' + run_stamp)
    os.makedirs(save_dir, exist_ok=True)

    drive_dir = "/content/drive/My Drive/data/nlp"
    samples = get_data(os.path.join(drive_dir, "LCCC-large.json"))

    # Wrap the samples in a SentencesDataset and feed them through a
    # shuffling DataLoader.
    dataset = SentencesDataset(samples, model=model)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    loss = losses.MultipleNegativesRankingLoss(model)

    ###### Duplicate Questions Information Retrieval ######
    query_limit = 5000
    corpus_limit = 100000
    queries, corpus, relevant_docs = get_iq_corpus(
        os.path.join(drive_dir, "STC.json"), query_limit, corpus_limit)

    def last_score(scores):
        # The main score is the score of the last evaluator in the sequence.
        return scores[-1]

    retrieval_evaluator = evaluation.InformationRetrievalEvaluator(
        queries, corpus, relevant_docs)
    seq_evaluator = evaluation.SequentialEvaluator(
        [retrieval_evaluator], main_score_function=last_score)

    logging.info("Evaluate model without training")
    seq_evaluator(model, epoch=0, steps=0, output_path=save_dir)

    # Train the model
    model.fit(
        train_objectives=[(loader, loss)],
        evaluator=seq_evaluator,
        epochs=epoch_count,
        warmup_steps=1000,
        output_path=save_dir,
        output_path_ignore_not_empty=True,
    )

예제 #2

파일 보기

# Top up the corpus with randomly ordered distraction questions until it
# reaches max_corpus_size (nothing is added if it is already that large).
other_qid_list = list(distraction_questions)
random.shuffle(other_qid_list)

num_free_slots = max(0, max_corpus_size - len(ir_corpus))
for qid in other_qid_list[:num_free_slots]:
    ir_corpus[qid] = distraction_questions[qid]

# Given queries, a corpus and a mapping with relevant documents, the
# InformationRetrievalEvaluator computes different IR metrics.
# For our use case MRR@k and Accuracy@k are relevant.
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    ir_queries,
    ir_corpus,
    ir_relevant_docs,
)
evaluators.append(ir_evaluator)

# Create a SequentialEvaluator that runs every evaluator in `evaluators` in
# order. The model is optimized with respect to the score of the last
# evaluator (scores[-1]).
seq_evaluator = evaluation.SequentialEvaluator(
    evaluators,
    main_score_function=lambda scores: scores[-1],
)

logging.info("Evaluate model without training")
seq_evaluator(model, epoch=0, steps=0, output_path=model_save_path)

# Train the model on both objectives.
joint_train_objectives = [
    (train_dataloader_MultipleNegativesRankingLoss,
     train_loss_MultipleNegativesRankingLoss),
    (train_dataloader_ConstrativeLoss, train_loss_ConstrativeLoss),
]
model.fit(
    train_objectives=joint_train_objectives,
    evaluator=seq_evaluator,
    epochs=num_epochs,
    warmup_steps=1000,
    output_path=model_save_path,
    output_path_ignore_not_empty=True,
)

예제 #3

파일 보기

파일: make_multilingual.py 프로젝트: ssundaranathan/sentence-transformers

                    # Each line carries three tab-separated fields: the two
                    # sentences and their score.
                    sent1, sent2, score = line.strip().split("\t")
                    # The score is read as text; store it as a float.
                    score = float(score)
                    # Accumulate the parsed fields in the per-file lists.
                    sts_data[filename]['sentences1'].append(sent1)
                    sts_data[filename]['sentences2'].append(sent2)
                    sts_data[filename]['scores'].append(score)

# Build one EmbeddingSimilarityEvaluator per entry of sts_data, named after
# its key, and add it to the list of evaluators.
for filename, data in sts_data.items():
    test_evaluator = evaluation.EmbeddingSimilarityEvaluator(
        data['sentences1'], data['sentences2'], data['scores'],
        batch_size=inference_batch_size,
        name=filename,
        show_progress_bar=False,
    )
    evaluators.append(test_evaluator)

# The main score is the mean over the scores of all evaluators.
mean_score_evaluator = evaluation.SequentialEvaluator(
    evaluators, main_score_function=np.mean)

student_optimizer_params = {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}

# Train the model
student_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=mean_score_evaluator,
    epochs=num_epochs,
    warmup_steps=num_warmup_steps,
    evaluation_steps=num_evaluation_steps,
    output_path=output_path,
    save_best_model=True,
    optimizer_params=student_optimizer_params,
)

예제 #4

파일 보기

파일: model_distillation.py 프로젝트: stjordanis/sentence-transformers

    # Log the teacher model's sentence-embedding dimension, then run
    # dev_evaluator_sts on the teacher model.
    logging.info("Teacher Performance with {} dimensions:".format(teacher_model.get_sentence_embedding_dimension()))
    dev_evaluator_sts(teacher_model)



# The student_model is trained to produce sentence embeddings similar to
# those of the teacher_model. This needs a large set of sentences: they are
# embedded with the teacher model and the student tries to mimic these
# embeddings. It is the same approach as used in:
# https://arxiv.org/abs/2004.09813
train_data = ParallelSentencesDataset(
    student_model=student_model,
    teacher_model=teacher_model,
    batch_size=inference_batch_size,
    use_embedding_cache=False,
)
# Every sentence is wrapped in its own single-element list before being added.
for train_sentence_source in (train_sentences_nli, train_sentences_wikipedia):
    train_data.add_dataset(
        [[sent] for sent in train_sentence_source],
        max_sentence_length=256,
    )

train_dataloader = DataLoader(
    train_data, batch_size=train_batch_size, shuffle=True)
train_loss = losses.MSELoss(model=student_model)

# Evaluator measuring the Mean Squared Error (MSE) between the teacher and
# the student embeddings on the combined dev sentences.
dev_sentences = dev_sentences_nli + dev_sentences_wikipedia
dev_evaluator_mse = evaluation.MSEEvaluator(
    dev_sentences, dev_sentences, teacher_model=teacher_model)

distillation_evaluators = [dev_evaluator_sts, dev_evaluator_mse]
distillation_optimizer_params = {'lr': 1e-4, 'eps': 1e-6, 'correct_bias': False}

# Train the student model to imitate the teacher
student_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluation.SequentialEvaluator(distillation_evaluators),
    epochs=1,
    warmup_steps=1000,
    evaluation_steps=5000,
    output_path=output_path,
    save_best_model=True,
    optimizer_params=distillation_optimizer_params,
    use_amp=True,
)