예제 #1
0
    def __init__(self,
                 document_store: BaseDocumentStore,
                 query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base",
                 passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base",
                 single_model_path: Optional[Union[Path, str]] = None,
                 model_version: Optional[str] = None,
                 max_seq_len_query: int = 64,
                 max_seq_len_passage: int = 256,
                 top_k: int = 10,
                 use_gpu: bool = True,
                 batch_size: int = 16,
                 embed_title: bool = True,
                 use_fast_tokenizers: bool = True,
                 infer_tokenizer_classes: bool = False,
                 similarity_function: str = "dot_product",
                 progress_bar: bool = True
                 ):
        """
        Build the retriever: load the query and passage encoders (either as two separate
        checkpoints or as one combined model directory), create the matching processor and
        wire both into a ``BiAdaptiveModel``.
        Checkpoints follow the huggingface transformers model format.

        **Example:**

                ```python
                |    # remote model from FAIR
                |    DensePassageRetriever(document_store=your_doc_store,
                |                          query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                |                          passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base")
                |    # or from local path
                |    DensePassageRetriever(document_store=your_doc_store,
                |                          query_embedding_model="model_directory/question-encoder",
                |                          passage_embedding_model="model_directory/context-encoder")
                ```

        :param document_store: DocumentStore instance to retrieve documents from. ``None`` is tolerated
                               (a warning is logged), which is sufficient for DPR training. A warning is
                               also logged if the store's ``similarity`` is not ``"dot_product"``.
        :param query_embedding_model: Local path or remote name of the question encoder checkpoint, in the
                                      format used by hugging-face transformers' modelhub models.
                                      Currently available remote names: ``"facebook/dpr-question_encoder-single-nq-base"``
                                      Ignored when ``single_model_path`` is given.
        :param passage_embedding_model: Local path or remote name of the passage encoder checkpoint, in the
                                        format used by hugging-face transformers' modelhub models.
                                        Currently available remote names: ``"facebook/dpr-ctx_encoder-single-nq-base"``
                                        Ignored when ``single_model_path`` is given.
        :param single_model_path: Local path or remote name of a single model that contains both the query and
                                  the passage embedder. Those models are typically trained within FARM.
                                  When set, processor and model are loaded from this location instead of
                                  the two separate checkpoints.
                                  Currently available remote names: TODO add FARM DPR model to HF modelhub
        :param model_version: Version of the model to use from the HuggingFace model hub (tag name, branch name
                              or commit hash). Forwarded as ``revision`` to the tokenizer and model loaders;
                              not used when ``single_model_path`` is given.
        :param max_seq_len_query: Maximum number of tokens for the query text. Longer ones will be cut down.
        :param max_seq_len_passage: Maximum number of tokens for the passage/context text. Longer ones will be cut down.
        :param top_k: How many documents to return per query.
        :param use_gpu: Whether to use the GPU. CUDA is only selected if it is also available, otherwise the CPU is used.
        :param batch_size: Number of questions or passages to encode at once.
        :param embed_title: Whether to concatenate title and passage to a text pair that is then used to create the embedding.
                            This is the approach used in the original paper and is likely to improve performance if your
                            titles contain meaningful information for retrieval (topic, entities etc.).
                            The title is expected to be present in doc.meta["name"] and can be supplied in the documents
                            before writing them to the DocumentStore like this:
                            {"text": "my text", "meta": {"name": "my title"}}.
        :param use_fast_tokenizers: Whether to use fast Rust tokenizers. Not used when ``single_model_path`` is given.
        :param infer_tokenizer_classes: Whether to infer the tokenizer class from the model config / name.
                                        If `False`, the class always loads `DPRQuestionEncoderTokenizer` and `DPRContextEncoderTokenizer`.
        :param similarity_function: Which function to apply for calculating the similarity of query and passage embeddings during training.
                                    Options: `dot_product` (Default) or `cosine`
        :param progress_bar: Whether to show a tqdm progress bar or not.
                             Can be helpful to disable in production deployments to keep the logs clean.
        """

        # Plain configuration that is only stored for later use.
        self.document_store = document_store
        self.top_k = top_k
        self.batch_size = batch_size
        self.progress_bar = progress_bar

        # Sanity-check the document store; both situations are legal, so only warn.
        if document_store is not None:
            if document_store.similarity != "dot_product":
                logger.warning(f"You are using a Dense Passage Retriever model with the {document_store.similarity} function. "
                               "We recommend you use dot_product instead. "
                               "This can be set when initializing the DocumentStore")
        else:
            logger.warning("DensePassageRetriever initialized without a document store. "
                           "This is fine if you are performing DPR training. "
                           "Otherwise, please provide a document store in the constructor.")

        # Fall back to the CPU whenever CUDA was not requested or is not available.
        cuda_usable = use_gpu and torch.cuda.is_available()
        self.device = torch.device("cuda" if cuda_usable else "cpu")

        # A tokenizer class of None is passed to the tokenizer loader when inference is requested.
        self.infer_tokenizer_classes = infer_tokenizer_classes
        query_tokenizer_class: Optional[str] = (
            None if self.infer_tokenizer_classes else "DPRQuestionEncoderTokenizer"
        )
        passage_tokenizer_class: Optional[str] = (
            None if self.infer_tokenizer_classes else "DPRContextEncoderTokenizer"
        )

        # Init & Load Encoders
        if single_model_path is not None:
            # Combined checkpoint: processor and model both come from the same directory.
            self.processor = TextSimilarityProcessor.load_from_dir(single_model_path)
            loaded_processor = self.processor
            loaded_processor.max_seq_len_passage = max_seq_len_passage
            loaded_processor.max_seq_len_query = max_seq_len_query
            loaded_processor.embed_title = embed_title
            loaded_processor.num_hard_negatives = 0
            # during indexing of documents only one embedding is created
            loaded_processor.num_positives = 1
            self.model = BiAdaptiveModel.load(single_model_path, device=self.device)
        else:
            # Two separate checkpoints: load tokenizer + encoder for the query side first,
            # then for the passage side.
            shared_tokenizer_kwargs = {
                "revision": model_version,
                "do_lower_case": True,
                "use_fast": use_fast_tokenizers,
            }

            self.query_tokenizer = Tokenizer.load(
                pretrained_model_name_or_path=query_embedding_model,
                tokenizer_class=query_tokenizer_class,
                **shared_tokenizer_kwargs,
            )
            self.query_encoder = LanguageModel.load(
                pretrained_model_name_or_path=query_embedding_model,
                revision=model_version,
                language_model_class="DPRQuestionEncoder",
            )
            self.passage_tokenizer = Tokenizer.load(
                pretrained_model_name_or_path=passage_embedding_model,
                tokenizer_class=passage_tokenizer_class,
                **shared_tokenizer_kwargs,
            )
            self.passage_encoder = LanguageModel.load(
                pretrained_model_name_or_path=passage_embedding_model,
                revision=model_version,
                language_model_class="DPRContextEncoder",
            )

            self.processor = TextSimilarityProcessor(
                query_tokenizer=self.query_tokenizer,
                passage_tokenizer=self.passage_tokenizer,
                max_seq_len_passage=max_seq_len_passage,
                max_seq_len_query=max_seq_len_query,
                label_list=["hard_negative", "positive"],
                metric="text_similarity_metric",
                embed_title=embed_title,
                num_hard_negatives=0,
                num_positives=1,
            )
            similarity_head = TextSimilarityHead(similarity_function=similarity_function)
            self.model = BiAdaptiveModel(
                language_model1=self.query_encoder,
                language_model2=self.passage_encoder,
                prediction_heads=[similarity_head],
                embeds_dropout_prob=0.1,
                lm1_output_types=["per_sequence"],
                lm2_output_types=["per_sequence"],
                device=self.device,
            )

        # Labels are not required here (require_labels=False).
        self.model.connect_heads_with_processor(self.processor.tasks, require_labels=False)
예제 #2
0
def test_dpr_training():
    """
    Smoke test for DPR training with a save/reload round trip.

    Steps:
      1. Build tokenizers, a ``TextSimilarityProcessor`` and a ``DataSilo`` on the
         sample data in ``samples/dpr/`` (at most 2 samples, sequences cut to 10 tokens).
      2. Build a ``BiAdaptiveModel`` from two MiniLM encoders and train it for one epoch on CPU.
      3. Save the model to ``testsave/dpr-model``, drop it, load it again and train the
         reloaded model for another epoch with its own optimizer and LR schedule.

    The test contains no explicit assertions; it passes if none of the steps raises.
    """
    batch_size = 1
    n_epochs = 1
    distributed = False  # enable for multi GPU training via DDP
    evaluate_every = 1
    question_lang_model = "microsoft/MiniLM-L12-H384-uncased"
    passage_lang_model = "microsoft/MiniLM-L12-H384-uncased"
    do_lower_case = True
    use_fast = True
    similarity_function = "dot_product"

    device, n_gpu = initialize_device_settings(use_cuda=False)

    query_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=question_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast)
    passage_tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=passage_lang_model,
        do_lower_case=do_lower_case,
        use_fast=use_fast)
    label_list = ["hard_negative", "positive"]

    processor = TextSimilarityProcessor(query_tokenizer=query_tokenizer,
                                        passage_tokenizer=passage_tokenizer,
                                        max_seq_len_query=10,
                                        max_seq_len_passage=10,
                                        label_list=label_list,
                                        metric="text_similarity_metric",
                                        data_dir="samples/dpr/",
                                        train_filename="sample.json",
                                        dev_filename="sample.json",
                                        test_filename=None,
                                        embed_title=True,
                                        num_hard_negatives=1,
                                        dev_split=0,
                                        max_samples=2)

    # Use the same flag as the optimizer setup below so both stay in sync
    # if distributed training is ever enabled for this test.
    data_silo = DataSilo(processor=processor,
                         batch_size=batch_size,
                         distributed=distributed)

    question_language_model = LanguageModel.load(
        pretrained_model_name_or_path=question_lang_model,
        language_model_class="DPRQuestionEncoder")
    passage_language_model = LanguageModel.load(
        pretrained_model_name_or_path=passage_lang_model,
        language_model_class="DPRContextEncoder")

    prediction_head = TextSimilarityHead(
        similarity_function=similarity_function)

    model = BiAdaptiveModel(
        language_model1=question_language_model,
        language_model2=passage_language_model,
        prediction_heads=[prediction_head],
        embeds_dropout_prob=0.1,
        lm1_output_types=["per_sequence"],
        lm2_output_types=["per_sequence"],
        device=device,
    )

    # NOTE: the option dicts are written out per call (not shared between the two
    # initialize_optimizer calls) in case the helper modifies them in place.
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        learning_rate=1e-5,
        optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0,
                        "eps": 1e-08},
        schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        grad_acc_steps=1,
        device=device,
        distributed=distributed
    )

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule,
        evaluate_every=evaluate_every,
        device=device,
    )

    trainer.train()

    ######## save and load model again
    save_dir = Path("testsave/dpr-model")
    model.save(save_dir)
    del model

    model2 = BiAdaptiveModel.load(save_dir, device=device)
    model2, optimizer2, lr_schedule2 = initialize_optimizer(
        model=model2,
        learning_rate=1e-5,
        optimizer_opts={"name": "TransformersAdamW", "correct_bias": True, "weight_decay": 0.0,
                        "eps": 1e-08},
        schedule_opts={"name": "LinearWarmup", "num_warmup_steps": 100},
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=n_epochs,
        grad_acc_steps=1,
        device=device,
        distributed=distributed
    )
    # The reloaded model must be trained with the optimizer and schedule that were
    # created for it; the first optimizer was built for the model deleted above.
    trainer2 = Trainer(
        model=model2,
        optimizer=optimizer2,
        data_silo=data_silo,
        epochs=n_epochs,
        n_gpu=n_gpu,
        lr_schedule=lr_schedule2,
        evaluate_every=evaluate_every,
        device=device,
    )

    trainer2.train()