Example No. 1
import os
import yaml

dirname = os.path.dirname(__file__)
configFile = os.path.join(dirname, 'config.yaml')

with open(configFile) as file:
    config = yaml.safe_load(file)

modelName = config["model"]
sqlUrl = config["sqlUrl"]
modelDir = config["modelDir"]
doc_dir = os.path.join(dirname, config["docDir"])

# download and save the model
from farm.modeling.adaptive_model import AdaptiveModel
from farm.data_handler.processor import Processor

model = AdaptiveModel.convert_from_transformers(modelName,
                                                device="cpu",
                                                task_type="question_answering")
processor = Processor.convert_from_transformers(modelName,
                                                task_type="question_answering",
                                                max_seq_len=384,
                                                doc_stride=128)
model.save(modelDir)
processor.save(modelDir)

# try:
#     # Alternative: load the reader straight from the hub (or configure cache_dir in from_pretrained).
#     # Don't rely on the cache; the model must be saved explicitly with save().
#     model = TransformersReader(model_name_or_path=modelName, use_gpu=-1)
#     model = FARMReader(model_name_or_path=modelName,
#                        use_gpu=False, no_ans_boost=0)
# except:
#     pass

# Assumed import for the SQL-backed document store (Haystack 0.x module layout):
from haystack.document_store.sql import SQLDocumentStore

document_store = SQLDocumentStore(url=sqlUrl)
# document_store = FAISSDocumentStore(sql_url=sqlUrl)
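
After saving, the converted model can be loaded back from the local directory; a minimal sketch, assuming the Haystack 0.x import path for FARMReader and the reader arguments shown in the commented block above:

# Hypothetical follow-up: build a reader from the locally saved model directory.
from haystack.reader.farm import FARMReader  # assumed Haystack 0.x module layout

reader = FARMReader(model_name_or_path=modelDir,  # directory written by model.save()/processor.save()
                    use_gpu=False,
                    no_ans_boost=0)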
Example No. 2
    def convert_to_onnx(cls,
                        model_name,
                        output_path,
                        task_type,
                        convert_to_float16=False,
                        quantize=False,
                        opset_version=11):
        """
        Convert a PyTorch model from transformers hub to an ONNX Model.

        :param model_name: transformers model name
        :type model_name: str
        :param output_path: output Path to write the converted model to
        :type output_path: Path
        :param task_type: Type of task for the model. Available options: "embeddings", "question_answering",
                          "text_classification", "ner".
        :param convert_to_float16: By default, the model uses float32 precision. With half precision (float16),
                                inference should be faster on NVIDIA GPUs with Tensor Cores, such as the T4 or V100.
                                On older GPUs, float32 might be more performant.
        :type convert_to_float16: bool
        :param quantize: convert floating point numbers to integers
        :type quantize: bool
        :param opset_version: ONNX opset version
        :type opset_version: int
        :return:
        """
        language_model_class = LanguageModel.get_language_model_class(
            model_name)
        if language_model_class not in ["Bert", "Roberta", "XLMRoberta"]:
            raise Exception(
                "The current ONNX conversion only support 'BERT', 'RoBERTa', and 'XLMRoberta' models."
            )

        task_type_to_pipeline_map = {
            "question_answering": "question-answering",
            "embeddings": "feature-extraction",
            "ner": "ner"
        }

        convert(pipeline_name=task_type_to_pipeline_map[task_type],
                framework="pt",
                model=model_name,
                output=output_path / "model.onnx",
                opset=opset_version,
                # `is` checks identity, not string equality; use == for the comparison
                use_external_format=(language_model_class == "XLMRoberta"))

        # save processor & model config files that are needed when loading the model with the FARM Inferencer
        processor = Processor.convert_from_transformers(
            tokenizer_name_or_path=model_name,
            task_type=task_type,
            max_seq_len=256,
            doc_stride=128,
            use_fast=True)
        processor.save(output_path)
        model = AdaptiveModel.convert_from_transformers(model_name,
                                                        device="cpu",
                                                        task_type=task_type)
        model.save(output_path)
        os.remove(
            output_path / "language_model.bin"
        )  # remove the actual PyTorch model (only configs are required)

        onnx_model_config = {
            "task_type": task_type,
            "onnx_opset_version": opset_version,
            "language_model_class": language_model_class,
            "language": model.language_model.language
        }
        with open(output_path / "onnx_model_config.json", "w") as f:
            json.dump(onnx_model_config, f)

        if convert_to_float16:
            from onnxruntime_tools import optimizer
            config = AutoConfig.from_pretrained(model_name)
            optimized_model = optimizer.optimize_model(
                input=str(output_path / "model.onnx"),
                model_type='bert',
                num_heads=config.num_attention_heads,  # number of attention heads, not hidden layers
                hidden_size=config.hidden_size)
            optimized_model.convert_model_float32_to_float16()
            # overwrite the exported model instead of writing to the current working directory
            optimized_model.save_model_to_file(str(output_path / "model.onnx"))

        if quantize:
            quantize_model(output_path / "model.onnx")
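
A minimal usage sketch for the method above, assuming it is exposed as a classmethod on AdaptiveModel (as in FARM) and using "deepset/bert-base-cased-squad2" purely as an example model name:

# Hypothetical call to the converter above; output_path must be a pathlib.Path.
from pathlib import Path
from farm.modeling.adaptive_model import AdaptiveModel

onnx_dir = Path("onnx-export")
onnx_dir.mkdir(parents=True, exist_ok=True)
AdaptiveModel.convert_to_onnx(model_name="deepset/bert-base-cased-squad2",
                              output_path=onnx_dir,
                              task_type="question_answering",
                              convert_to_float16=False,
                              quantize=False)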
Example No. 3
    def load(
        cls,
        model_name_or_path,
        revision=None,
        batch_size=4,
        gpu=False,
        task_type=None,
        return_class_probs=False,
        strict=True,
        max_seq_len=256,
        doc_stride=128,
        extraction_layer=None,
        extraction_strategy=None,
        s3e_stats=None,
        num_processes=None,
        disable_tqdm=False,
        tokenizer_class=None,
        use_fast=True,
        tokenizer_args=None,
        multithreading_rust=True,
        dummy_ph=False,
        benchmarking=False,
    ):
        """
        Load an Inferencer incl. all relevant components (model, tokenizer, processor ...) either by

        1. specifying a public name from transformers' model hub (https://huggingface.co/models)
        2. or pointing to a local directory it is saved in.

        :param model_name_or_path: Local directory or public name of the model to load.
        :type model_name_or_path: str
        :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
        :type revision: str
        :param batch_size: Number of samples to compute in a single batch
        :type batch_size: int
        :param gpu: Whether to use a GPU (if available)
        :type gpu: bool
        :param task_type: Type of task the model should be used for. Currently supporting:
                          "embeddings", "question_answering", "text_classification", "ner". More coming soon...
        :type task_type: str
        :param strict: whether to strictly enforce that the keys loaded from the saved model match the ones in
                       the PredictionHead (see torch.nn.Module.load_state_dict()).
                       Set to `False` for backwards compatibility with prediction heads saved with older versions of FARM.
        :type strict: bool
        :param max_seq_len: maximum length of one text sample
        :type max_seq_len: int
        :param doc_stride: Only QA: when the input text is longer than max_seq_len, it gets split into parts, strided by doc_stride
        :type doc_stride: int
        :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector), 'reduce_mean'
                               (sentence vector), 'reduce_max' (sentence vector), 'per_token' (individual token vectors)
        :type extraction_strategy: str
        :param extraction_layer: number of the layer from which the embeddings shall be extracted. Default: -1 (the very last layer).
        :type extraction_layer: int
        :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                          (only needed for task_type="embeddings" and extraction_strategy = "s3e")
        :type s3e_stats: dict
        :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 0 to disable
                              multiprocessing. Set to None to let Inferencer use all CPU cores minus one. If you want to
                              debug the Language Model, you might need to disable multiprocessing!
                              **Warning!** If you use multiprocessing you have to close the
                              `multiprocessing.Pool` again! To do so call
                              :func:`~farm.infer.Inferencer.close_multiprocessing_pool` after you are
                              done using this class. The garbage collector will not do this for you!
        :type num_processes: int
        :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
        :type disable_tqdm: bool
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, True by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            use the Python one (False).
        :type use_fast: bool
        :param tokenizer_args: (Optional) Will be passed to the Tokenizer ``__init__`` method.
            See https://huggingface.co/transformers/main_classes/tokenizer.html and detailed tokenizer documentation
            on `Hugging Face Transformers <https://huggingface.co/transformers/>`_.
        :type tokenizer_args: dict
        :param multithreading_rust: Whether to allow multithreading in Rust, e.g. for FastTokenizers.
                                    Note: Enabling multithreading in Rust AND multiprocessing in python might cause
                                    deadlocks.
        :type multithreading_rust: bool
        :param dummy_ph: If True, methods of the prediction head will be replaced
                             with a dummy method. This is used to isolate language model run time from
                             prediction head run time.
        :type dummy_ph: bool
        :param benchmarking: If True, a benchmarking object will be initialised within the class and
                             certain parts of the code will be timed for benchmarking. Should be kept
                             False if not benchmarking since these timing checkpoints require synchronization
                             of the asynchronous PyTorch operations and may slow down the model.
        :type benchmarking: bool
        :return: An instance of the Inferencer.

        """
        if tokenizer_args is None:
            tokenizer_args = {}

        device, n_gpu = initialize_device_settings(use_cuda=gpu,
                                                   local_rank=-1,
                                                   use_amp=None)
        name = os.path.basename(model_name_or_path)

        # a) either from local dir
        if os.path.exists(model_name_or_path):
            model = BaseAdaptiveModel.load(load_dir=model_name_or_path,
                                           device=device,
                                           strict=strict)
            if task_type == "embeddings":
                processor = InferenceProcessor.load_from_dir(
                    model_name_or_path)
            else:
                processor = Processor.load_from_dir(model_name_or_path)

        # b) or from remote transformers model hub
        else:
            if not task_type:
                raise ValueError(
                    "Please specify the 'task_type' of the model you want to load from transformers. "
                    "Valid options for arg `task_type`:"
                    "'question_answering', 'embeddings', 'text_classification', 'ner'"
                )

            model = AdaptiveModel.convert_from_transformers(
                model_name_or_path,
                revision=revision,
                device=device,
                task_type=task_type)
            processor = Processor.convert_from_transformers(
                model_name_or_path,
                revision=revision,
                task_type=task_type,
                max_seq_len=max_seq_len,
                doc_stride=doc_stride,
                tokenizer_class=tokenizer_class,
                tokenizer_args=tokenizer_args,
                use_fast=use_fast)

        # override processor attributes loaded from config or HF with inferencer params
        processor.max_seq_len = max_seq_len
        processor.multithreading_rust = multithreading_rust
        if hasattr(processor, "doc_stride"):
            assert doc_stride < max_seq_len, "doc_stride is longer than max_seq_len. This means that there will be gaps " \
                                             "as the passage windows slide, causing the model to skip over parts of the document. " \
                                             "Please set a lower value for doc_stride (Suggestions: doc_stride=128, max_seq_len=384) "
            processor.doc_stride = doc_stride

        return cls(model,
                   processor,
                   task_type=task_type,
                   batch_size=batch_size,
                   gpu=gpu,
                   name=name,
                   return_class_probs=return_class_probs,
                   extraction_strategy=extraction_strategy,
                   extraction_layer=extraction_layer,
                   s3e_stats=s3e_stats,
                   num_processes=num_processes,
                   disable_tqdm=disable_tqdm,
                   benchmarking=benchmarking,
                   dummy_ph=dummy_ph)
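
A minimal usage sketch for load(), assuming it is FARM's Inferencer.load and using a public QA model name from the Hugging Face hub as an example:

# Hypothetical usage of the loader above for extractive QA.
from farm.infer import Inferencer

inferencer = Inferencer.load("deepset/roberta-base-squad2",
                             task_type="question_answering",
                             gpu=False,
                             num_processes=0)  # 0 disables multiprocessing, so no pool needs closing
qa_input = [{"qas": ["Who is the father of Arya Stark?"],
             "context": "Arya Stark is the daughter of Eddard Stark."}]
result = inferencer.inference_from_dicts(dicts=qa_input)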
Example No. 4
import os
import yaml

# Assumed imports for this example (FARM, plus the Haystack 0.x module layout):
from farm.modeling.adaptive_model import AdaptiveModel
from farm.data_handler.processor import Processor
from haystack.document_store.faiss import FAISSDocumentStore

dirname = os.path.dirname(__file__)

configFile = os.path.join(dirname, 'config.yaml')

with open(configFile) as file:
    config = yaml.safe_load(file)

sqlUrlFAQ = config["sqlUrlFAQ"]


model = AdaptiveModel.convert_from_transformers(
    "deepset/sentence_bert", device="cpu", task_type="embeddings")
processor = Processor.convert_from_transformers(
    "deepset/sentence_bert", task_type="embeddings", max_seq_len=384, doc_stride=128)
model.save("sentence_bert-saved")
processor.save("sentence_bert-saved")


document_store = FAISSDocumentStore(sql_url=sqlUrlFAQ)


# from haystack.retriever.dense import DensePassageRetriever
# retriever = DensePassageRetriever(document_store=document_store,
#                                   query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
#                                   passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
#                                   max_seq_len_query=64,
#                                   max_seq_len_passage=256,
#                                   batch_size=16,
#                                   use_gpu=True,
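
As an alternative to the commented-out DPR retriever above, the saved sentence_bert model could back an EmbeddingRetriever; a minimal sketch, assuming the Haystack 0.x API (haystack.retriever.dense.EmbeddingRetriever with model_format="farm"):

# Hypothetical retriever over the FAISS store, using the locally saved embeddings model.
from haystack.retriever.dense import EmbeddingRetriever

retriever = EmbeddingRetriever(document_store=document_store,
                               embedding_model="sentence_bert-saved",
                               model_format="farm",
                               use_gpu=False)
# Compute/refresh the FAISS vectors for the documents already written to the SQL backend.
document_store.update_embeddings(retriever)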