Example #1
    def _prepare_model(self, encoder, saved_state, prefix):
        encoder.to(self.device)
        if self.use_amp:
            try:
                import apex
                from apex import amp
                apex.amp.register_half_function(torch, "einsum")
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )

            encoder, _ = amp.initialize(encoder, None, opt_level=self.use_amp)

        encoder.eval()

        # load weights from the model file
        model_to_load = encoder.module if hasattr(encoder, 'module') else encoder
        logger.info('Loading saved model state ...')
        logger.debug('saved model keys =%s', saved_state.model_dict.keys())

        prefix_len = len(prefix)
        ctx_state = {
            key[prefix_len:]: value
            for (key, value) in saved_state.model_dict.items()
            if key.startswith(prefix)
        }
        model_to_load.load_state_dict(ctx_state)
        return encoder
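
The loading step above keeps only the keys that belong to one encoder inside a combined BiEncoder checkpoint and strips their prefix so they match the bare encoder's state dict. A minimal, standalone sketch of that prefix filtering (toy dictionary, not a real checkpoint):

state_dict = {
    "ctx_model.encoder.layer.0.weight": "ctx weights",
    "question_model.encoder.layer.0.weight": "question weights",
}
prefix = "ctx_model."
prefix_len = len(prefix)
# keep only the entries for the chosen sub-model and drop the prefix from their keys
ctx_state = {
    key[prefix_len:]: value
    for key, value in state_dict.items()
    if key.startswith(prefix)
}
print(ctx_state)  # {'encoder.layer.0.weight': 'ctx weights'}
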
Example #2
    def _generate_batch_predictions(
            self,
            texts: List[str],
            model: torch.nn.Module,
            tensorizer: Tensorizer,
            batch_size: int = 16) -> List[np.ndarray]:
        n = len(texts)
        total = 0
        results = []
        for batch_start in range(0, n, batch_size):

            batch_token_tensors = [
                tensorizer.text_to_tensor(ctx)
                for ctx in texts[batch_start:batch_start + batch_size]
            ]

            ctx_ids_batch = torch.stack(batch_token_tensors,
                                        dim=0).to(self.device)
            ctx_seg_batch = torch.zeros_like(ctx_ids_batch).to(self.device)
            ctx_attn_mask = tensorizer.get_attn_mask(ctx_ids_batch).to(
                self.device)
            with torch.no_grad():
                _, out, _ = model(ctx_ids_batch, ctx_seg_batch, ctx_attn_mask)
            out = out.cpu()

            total += len(batch_token_tensors)

            # flatten each pooled output into a 1-D embedding vector
            results.extend([out[i].view(-1).numpy()
                            for i in range(out.size(0))])

            if total % 10 == 0:
                logger.info(f'Embedded {total} / {n} texts')

        return results
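
A self-contained toy sketch of the batching pattern used above; ToyEncoder is hypothetical and only mimics the (sequence_output, pooled_output, hidden_states) return shape of the real encoder:

import torch

class ToyEncoder(torch.nn.Module):
    def __init__(self, vocab_size=100, dim=8):
        super().__init__()
        self.emb = torch.nn.Embedding(vocab_size, dim)

    def forward(self, ids, segments, mask):
        # segments are accepted but ignored in this toy model
        hidden = self.emb(ids) * mask.unsqueeze(-1).float()
        pooled = hidden.mean(dim=1)
        return hidden, pooled, None

encoder = ToyEncoder()
encoder.eval()
batch_token_tensors = [torch.randint(1, 100, (16,)) for _ in range(4)]
ctx_ids_batch = torch.stack(batch_token_tensors, dim=0)
ctx_seg_batch = torch.zeros_like(ctx_ids_batch)
ctx_attn_mask = (ctx_ids_batch != 0).long()
with torch.no_grad():
    _, out, _ = encoder(ctx_ids_batch, ctx_seg_batch, ctx_attn_mask)
embeddings = [out[i].view(-1).numpy() for i in range(out.size(0))]
print(len(embeddings), embeddings[0].shape)  # 4 (8,)
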
Example #3
    def __init__(
        self,
        document_store: BaseDocumentStore,
        embedding_model: str,
        use_gpu: bool = True,
        model_format: str = "farm",
        pooling_strategy: str = "reduce_mean",
        emb_extraction_layer: int = -1,
    ):
        """
        :param document_store: An instance of DocumentStore from which to retrieve documents.
        :param embedding_model: Local path or name of model in Hugging Face's model hub. Example: 'deepset/sentence_bert'
        :param use_gpu: Whether to use gpu or not
        :param model_format: Name of framework that was used for saving the model. Options: 'farm', 'transformers', 'sentence_transformers'
        :param pooling_strategy: Strategy for combining the embeddings from the model (for farm / transformers models only).
                                 Options: 'cls_token' (sentence vector), 'reduce_mean' (sentence vector),
                                 'reduce_max' (sentence vector), 'per_token' (individual token vectors)
        :param emb_extraction_layer: Layer from which the embeddings shall be extracted (for farm / transformers models only).
                                     Default: -1 (very last layer).
        """
        self.document_store = document_store
        self.model_format = model_format
        self.embedding_model = embedding_model
        self.pooling_strategy = pooling_strategy
        self.emb_extraction_layer = emb_extraction_layer

        logger.info(
            f"Init retriever using embeddings of model {embedding_model}")
        if model_format == "farm" or model_format == "transformers":
            self.embedding_model = Inferencer.load(
                embedding_model,
                task_type="embeddings",
                extraction_strategy=self.pooling_strategy,
                extraction_layer=self.emb_extraction_layer,
                gpu=use_gpu,
                batch_size=4,
                max_seq_len=512,
                num_processes=0)

        elif model_format == "sentence_transformers":
            from sentence_transformers import SentenceTransformer

            # pretrained embedding models coming from: https://github.com/UKPLab/sentence-transformers#pretrained-models
            # e.g. 'roberta-base-nli-stsb-mean-tokens'
            if use_gpu:
                device = "cuda"
            else:
                device = "cpu"
            self.embedding_model = SentenceTransformer(embedding_model,
                                                       device=device)
        else:
            raise NotImplementedError
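
For the "sentence_transformers" branch above, a minimal usage sketch of loading and applying such a model directly; the model name is only the example mentioned in the code comment:

from sentence_transformers import SentenceTransformer

# load one of the pretrained sentence-transformers models on CPU or GPU ("cuda")
model = SentenceTransformer("roberta-base-nli-stsb-mean-tokens", device="cpu")
# encode() returns one embedding vector per input text
embeddings = model.encode(["How does dense passage retrieval work?"])
print(embeddings[0].shape)
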
Example #4
    def _generate_batch_predictions(
        self,
        texts: List[str],
        model: torch.nn.Module,
        tokenizer: Union[DPRQuestionEncoderTokenizer,
                         DPRContextEncoderTokenizer],
        titles: Optional[List[str]] = None,  # useful only for passage embedding with DPR!
        batch_size: int = 16
    ) -> List[np.ndarray]:
        n = len(texts)
        total = 0
        results = []
        for batch_start in range(0, n, batch_size):
            # create batch of titles only for passages
            ctx_title = None
            if self.embed_title and titles:
                ctx_title = titles[batch_start:batch_start + batch_size]

            # create batch of text
            ctx_text = texts[batch_start:batch_start + batch_size]

            # tensorize the batch
            ctx_ids_batch, _, ctx_attn_mask = self._tensorizer(tokenizer,
                                                               text=ctx_text,
                                                               title=ctx_title)
            ctx_seg_batch = torch.zeros_like(ctx_ids_batch).to(self.device)

            # remove [SEP] token from untitled passages in batch
            if self.embed_title and self.remove_sep_tok_from_untitled_passages and ctx_title:
                ctx_ids_batch, ctx_attn_mask = self._remove_sep_tok_from_untitled_passages(
                    ctx_title, ctx_ids_batch, ctx_attn_mask)

            with torch.no_grad():
                out = model(input_ids=ctx_ids_batch,
                            attention_mask=ctx_attn_mask,
                            token_type_ids=ctx_seg_batch)
                # TODO revert back to when updating transformers
                # out = out.pooler_output
                out = out[0]
            out = out.cpu()

            total += ctx_ids_batch.size()[0]

            # flatten each pooled output into a 1-D embedding vector
            results.extend([out[i].view(-1).numpy()
                            for i in range(out.size(0))])

            if total % 10 == 0:
                logger.info(f'Embedded {total} / {n} texts')

        return results
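
A hedged, self-contained sketch of the encoding step above using the public transformers DPR classes; the checkpoint name is the one referenced in Example #7, and out[0] corresponds to the pooled passage embedding:

import torch
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer

tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
model.eval()

# tokenize a small batch of passages with padding so they stack into one tensor
batch = tokenizer(["Paris is the capital of France.", "Berlin is the capital of Germany."],
                  padding=True, return_tensors="pt")
with torch.no_grad():
    out = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
embeddings = out[0].cpu()  # one fixed-size vector per passage
print(embeddings.shape)
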
Example #5
    def _generate_batch_predictions(
        self,
        texts: List[str],
        model: torch.nn.Module,
        tensorizer: Tensorizer,
        titles: Optional[List[str]] = None,  # useful only for passage embedding with DPR!
        batch_size: int = 16
    ) -> List[np.ndarray]:
        n = len(texts)
        total = 0
        results = []
        for batch_start in range(0, n, batch_size):

            if model == self.passage_encoder and titles:
                batch_texts = texts[batch_start:batch_start + batch_size]
                batch_titles = titles[batch_start:batch_start + batch_size]
                batch_token_tensors = [
                    tensorizer.text_to_tensor(text=ctx_text, title=ctx_title)
                    for ctx_text, ctx_title in zip(batch_texts, batch_titles)
                ]
            else:
                batch_token_tensors = [
                    tensorizer.text_to_tensor(text=ctx_text)
                    for ctx_text in texts[batch_start:batch_start + batch_size]
                ]

            ctx_ids_batch = torch.stack(batch_token_tensors,
                                        dim=0).to(self.device)
            ctx_seg_batch = torch.zeros_like(ctx_ids_batch).to(self.device)
            ctx_attn_mask = tensorizer.get_attn_mask(ctx_ids_batch).to(
                self.device)
            with torch.no_grad():
                _, out, _ = model(ctx_ids_batch, ctx_seg_batch, ctx_attn_mask)
            out = out.cpu()

            total += len(batch_token_tensors)

            # flatten each pooled output into a 1-D embedding vector
            results.extend([out[i].view(-1).numpy()
                            for i in range(out.size(0))])

            if total % 10 == 0:
                logger.info(f'Embedded {total} / {n} texts')

        return results
Example #6
    def __init__(
        self,
        document_store: BaseDocumentStore,
        embedding_model: str,
        use_gpu: bool = True,
        batch_size: int = 16,
        do_lower_case: bool = False,
        use_amp: str = None,
    ):
        """
        Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
        The checkpoint format matches the one used in the original authors' repository (https://github.com/facebookresearch/DPR).
        See their readme for manual download instructions: https://github.com/facebookresearch/DPR#resources--data-formats

        :Example:

            # remote model from FAIR
            >>> DensePassageRetriever(document_store=your_doc_store, embedding_model="dpr-bert-base-nq", use_gpu=True)
            # or from local path
            >>> DensePassageRetriever(document_store=your_doc_store, embedding_model="some_path/bert-base-encoder.cp", use_gpu=True)

        :param document_store: An instance of DocumentStore from which to retrieve documents.
        :param embedding_model: Local path or remote name of model checkpoint. The format equals the
                                one used by the original authors in https://github.com/facebookresearch/DPR.
                                Currently available remote names: "dpr-bert-base-nq"
        :param use_gpu: Whether to use gpu or not
        :param batch_size: Number of questions or passages to encode at once
        :param do_lower_case: Whether to lower case the text input in the tokenizer
        :param use_amp: Optional usage of Automatic Mixed Precision (apex) to improve speed and memory consumption.
                        Choose `None` or an AMP optimization level:
                              - None -> Not using amp at all
                              - 'O0' -> Regular FP32
                              - 'O1' -> Mixed Precision (recommended, if optimization is wanted)
        """

        self.document_store = document_store
        self.embedding_model = embedding_model
        self.batch_size = batch_size

        #TODO Proper Download + Caching of model if not locally available
        if embedding_model == "dpr-bert-base-nq":
            if not Path(
                    "models/dpr/checkpoint/retriever/single/nq/bert-base-encoder.cp"
            ).is_file():
                download_dpr(
                    resource_key="checkpoint.retriever.single.nq.bert-base-encoder",
                    out_dir="models/dpr")
            self.embedding_model = "models/dpr/checkpoint/retriever/single/nq/bert-base-encoder.cp"

        if use_gpu and torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.use_amp = use_amp
        self.do_lower_case = do_lower_case

        # Load checkpoint (incl. additional model params)
        saved_state = load_states_from_checkpoint(self.embedding_model)
        logger.info('Loaded encoder params:  %s', saved_state.encoder_params)
        self.do_lower_case = saved_state.encoder_params["do_lower_case"]
        self.pretrained_model_cfg = saved_state.encoder_params[
            "pretrained_model_cfg"]
        self.encoder_model_type = saved_state.encoder_params[
            "encoder_model_type"]
        self.pretrained_file = saved_state.encoder_params["pretrained_file"]
        self.projection_dim = saved_state.encoder_params["projection_dim"]
        self.sequence_length = saved_state.encoder_params["sequence_length"]

        # Init & Load Encoders
        self.query_encoder = HFBertEncoder.init_encoder(
            self.pretrained_model_cfg,
            projection_dim=self.projection_dim,
            dropout=0.0)
        self.passage_encoder = HFBertEncoder.init_encoder(
            self.pretrained_model_cfg,
            projection_dim=self.projection_dim,
            dropout=0.0)
        self.passage_encoder = self._prepare_model(self.passage_encoder,
                                                   saved_state,
                                                   prefix="ctx_model.")
        self.query_encoder = self._prepare_model(self.query_encoder,
                                                 saved_state,
                                                 prefix="question_model.")
        #self.encoder = BiEncoder(question_encoder, ctx_encoder, fix_ctx_encoder=self.fix_ctx_encoder)

        # Load Tokenizer & Tensorizer
        tokenizer = BertTokenizer.from_pretrained(
            self.pretrained_model_cfg, do_lower_case=self.do_lower_case)
        self.tensorizer = BertTensorizer(tokenizer, self.sequence_length)
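
A minimal sketch of how the use_amp level described in the docstring is applied inside _prepare_model (assumes apex is installed; torch.nn.Linear is only a stand-in for the real BERT encoder):

import torch
from apex import amp

use_amp = "O1"  # None disables AMP, "O0" is plain FP32, "O1" is mixed precision
encoder = torch.nn.Linear(768, 768).to("cuda")  # stand-in for the BERT encoder

if use_amp:
    # same call pattern as in _prepare_model: inference only, so no optimizer is passed
    encoder, _ = amp.initialize(encoder, None, opt_level=use_amp)
encoder.eval()
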
Example #7
    def __init__(
        self,
        document_store: BaseDocumentStore,
        query_embedding_model: str,
        passage_embedding_model: str,
        max_seq_len: int = 256,
        use_gpu: bool = True,
        batch_size: int = 16,
        embed_title: bool = True,
        remove_sep_tok_from_untitled_passages: bool = True,
        model_type: str = "dpr",
        pad_to_max_length: bool = True,
    ):
        """
        Init the Retriever incl. the two encoder models from a local or remote model checkpoint.
        The checkpoint format matches huggingface transformers' model format

        :Example:

            # remote model from FAIR
            >>> DensePassageRetriever(document_store=your_doc_store,
                                      query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
                                      passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
                                      use_gpu=True)
            # or from local path
            >>> DensePassageRetriever(document_store=your_doc_store,
                                      query_embedding_model="local-path/query-checkpoint",
                                      passage_embedding_model="local-path/ctx-checkpoint",
                                      use_gpu=True)
        :param document_store: An instance of DocumentStore from which to retrieve documents.
        :param query_embedding_model: Local path or remote name of question encoder checkpoint. The format equals the
                                one used by Hugging Face transformers' model hub models.
                                Currently available remote names: "facebook/dpr-question_encoder-single-nq-base"
        :param passage_embedding_model: Local path or remote name of passage encoder checkpoint. The format equals the
                                one used by Hugging Face transformers' model hub models.
                                Currently available remote names: "facebook/dpr-ctx_encoder-single-nq-base"
        :param max_seq_len: Maximum length of each sequence
        :param use_gpu: Whether to use gpu or not
        :param batch_size: Number of questions or passages to encode at once
        :param embed_title: Whether to concatenate title and passage into a text pair that is then used to create the embedding
        :param remove_sep_tok_from_untitled_passages: If embed_title is true, there are different strategies to deal with documents that don't have a title.
                                                      True => Embed passage as single text, similar to embed_title = False (i.e [CLS] passage_tok1 ... [SEP])
                                                      False => Embed passage as text pair with empty title (i.e. [CLS] [SEP] passage_tok1 ... [SEP])
        :param pad_to_max_length: Whether to add padding or not                                               
        """

        self.document_store = document_store
        self.batch_size = batch_size
        self.max_seq_len = max_seq_len

        if use_gpu and torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.embed_title = embed_title
        self.remove_sep_tok_from_untitled_passages = remove_sep_tok_from_untitled_passages

        self.model_type = model_type.upper()
        # Init & Load Encoders

        #1. Load Tokenizer
        # NB: the same tokenizers are always used, even when switching between checkpoints
        self.query_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(
            "facebook/dpr-question_encoder-single-nq-base")
        self.passage_tokenizer = DPRContextEncoderTokenizer.from_pretrained(
            "facebook/dpr-ctx_encoder-single-nq-base")

        #2. Load Model
        valid_model_types = ["DPR", "ORQA", "REALM"]

        if self.model_type in valid_model_types:
            self.query_encoder = DPRQuestionEncoder.from_pretrained(
                query_embedding_model).to(self.device)
            self.passage_encoder = DPRContextEncoder.from_pretrained(
                passage_embedding_model).to(self.device)
        else:
            raise NotImplementedError

        self.pad_to_max_length = pad_to_max_length

        self.debug_mode = False  #Set it from outside (TMP)

        logger.info(f"BiEncoder implementation with {self.model_type}")