Example #1
def get_framework(model, revision: Optional[str] = None):
    """
    Select framework (TensorFlow or PyTorch) to use.

    Args:
        model (:obj:`str`, :class:`~transformers.PreTrainedModel` or :class:`~transformers.TFPreTrainedModel`):
            If both frameworks are installed, picks the one corresponding to the model passed (either a model class or
            the model name). If no specific model is provided, defaults to using PyTorch.
    """
    warnings.warn(
        "`get_framework` is deprecated and will be removed in v5, use `infer_framework_from_model` instead.",
        FutureWarning,
    )
    if not is_tf_available() and not is_torch_available():
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )
    if isinstance(model, str):
        if is_torch_available() and not is_tf_available():
            model = AutoModel.from_pretrained(model, revision=revision)
        elif is_tf_available() and not is_torch_available():
            model = TFAutoModel.from_pretrained(model, revision=revision)
        else:
            try:
                model = AutoModel.from_pretrained(model, revision=revision)
            except OSError:
                model = TFAutoModel.from_pretrained(model, revision=revision)

    framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
    return framework
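A minimal usage sketch (the import path and checkpoint name below are assumptions; `get_framework` lived in `transformers.pipelines.base` in the v4.x releases where it was deprecated):

from transformers.pipelines.base import get_framework  # import path varies by version

framework = get_framework("distilbert-base-uncased")  # emits the FutureWarning above
print(framework)  # "pt" if the checkpoint was resolved with PyTorch, otherwise "tf"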
Example #2
def infer_framework_from_model(model,
                               model_classes: Optional[Dict[str, type]] = None,
                               revision: Optional[str] = None,
                               task: Optional[str] = None,
                               onnx: bool = True,
                               graph_path: Optional[Path] = None):
    """
    Select framework (TensorFlow or PyTorch) to use from the :obj:`model` passed. Returns a tuple (framework, model).

    If :obj:`model` is instantiated, this function will just infer the framework from the model class. Otherwise
    :obj:`model` is actually a checkpoint name and this method will try to instantiate it using :obj:`model_classes`.
    Since we don't want to instantiate the model twice, this model is returned for use by the pipeline.

    If both frameworks are installed and available for :obj:`model`, PyTorch is selected.

    Args:
        model (:obj:`str`, :class:`~transformers.PreTrainedModel` or :class:`~transformers.TFPreTrainedModel`):
            The model to infer the framework from. If :obj:`str`, a checkpoint name.
        model_classes (dictionary :obj:`str` to :obj:`type`, `optional`):
            A mapping from framework name (:obj:`"pt"` or :obj:`"tf"`) to the model class to instantiate.
        revision (:obj:`str`, `optional`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
            identifier allowed by git.

    Returns:
        :obj:`Tuple`: A tuple (framework, model).
    """
    if not is_tf_available() and not is_torch_available():
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )
    if model_classes is None:
        model_classes = {}
    # Only load the checkpoint when there is no exported ONNX graph to reuse.
    if (onnx and (graph_path is None or not os.path.exists(graph_path))) or not onnx:
        if isinstance(model, str):
            if is_torch_available() and not is_tf_available():
                model_class = model_classes.get("pt", AutoModel)
                model = model_class.from_pretrained(model, revision=revision)
            elif is_tf_available() and not is_torch_available():
                model_class = model_classes.get("tf", TFAutoModel)
                model = model_class.from_pretrained(model, revision=revision)
            else:
                try:
                    model_class = model_classes.get("pt", AutoModel)
                    model = model_class.from_pretrained(model, revision=revision)
                except OSError:
                    model_class = model_classes.get("tf", TFAutoModel)
                    model = model_class.from_pretrained(model, revision=revision)
        # Set `framework` outside the `isinstance` check so it is also assigned
        # when an already-instantiated model is passed in.
        framework = "tf" if model.__class__.__name__.startswith("TF") else "pt"
    else:
        framework = "pt" if is_torch_available() else "tf"

    return framework, model
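A hedged usage sketch of the function above; with onnx=False the loading branch always runs and both pieces of the returned tuple come back (the checkpoint name is illustrative):

# Assumes AutoModel / TFAutoModel and the availability helpers are in scope,
# as in the module this snippet was taken from.
framework, model = infer_framework_from_model("distilbert-base-uncased", onnx=False)
print(framework)             # "pt" when PyTorch is available
print(type(model).__name__)  # e.g. "DistilBertModel"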
Example #3
class TFCLIPTextModelTest(TFModelTesterMixin, unittest.TestCase):

    all_model_classes = (TFCLIPTextModel,) if is_tf_available() else ()
    test_pruning = False
    test_head_masking = False
    test_onnx = False

    def setUp(self):
        self.model_tester = TFCLIPTextModelTester(self)
        self.config_tester = ConfigTester(self, config_class=CLIPTextConfig, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_inputs_embeds(self):
        # CLIP does not use inputs_embeds
        pass

    @slow
    def test_model_from_pretrained(self):
        for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = TFCLIPTextModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
Example #4
def relevent_dialog_convert_examples_to_features(
    examples: Union[List[InputExample], "tf.data.Dataset"],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    """
    Loads a data file into a list of ``InputFeatures``
    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length. Defaults to the tokenizer's max_len
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.
    """
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        if task is None:
            raise ValueError("When calling relevent_dialog_convert_examples_to_features from TF, the task parameter is required.")
        return _tf_relevent_dialog_convert_examples_to_features(examples, tokenizer, max_length=max_length, task=task)
    return _relevent_dialog_convert_examples_to_features(
        examples, tokenizer, max_length=max_length, task=task, label_list=label_list, output_mode=output_mode
    )
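A hypothetical call on a plain list of InputExamples; it assumes the private helper `_relevent_dialog_convert_examples_to_features` delegated to above is defined in the same module:

from transformers import AutoTokenizer, InputExample

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
examples = [InputExample(guid="0", text_a="How do I reset my password?",
                         text_b="Click 'Forgot password'.", label="1")]
features = relevent_dialog_convert_examples_to_features(
    examples, tokenizer, max_length=64, label_list=["0", "1"], output_mode="classification"
)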
Example #5
def get_default_model(targeted_task: Dict, framework: Optional[str],
                      task_options: Optional[Any]) -> str:
    """
    Select a default model to use for a given task. Defaults to pytorch if ambiguous.

    Args:
        targeted_task (:obj:`Dict` ):
           Dictionary representing the given task, that should contain default models

        framework (:obj:`str`, None)
           "pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet.

        task_options (:obj:`Any`, None)
           Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for
           translation task.

    Returns

        :obj:`str` The model string representing the default model for this pipeline
    """
    if is_torch_available() and not is_tf_available():
        framework = "pt"
    elif is_tf_available() and not is_torch_available():
        framework = "tf"

    defaults = targeted_task["default"]
    if task_options:
        if task_options not in defaults:
            raise ValueError(
                f"The task does not provide any default models for options {task_options}"
            )
        default_models = defaults[task_options]["model"]
    elif "model" in defaults:
        default_models = targeted_task["default"]["model"]
    else:
        # XXX This error message needs to be updated to be more generic if more tasks are going to become
        # parametrized
        raise ValueError(
            'The task defaults can\'t be correctly selected. You probably meant "translation_XX_to_YY"'
        )

    if framework is None:
        framework = "pt"

    return default_models[framework]
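A minimal sketch with a hand-built task dictionary; real entries come from the pipeline task registry, so the shape below is only an assumption based on how the function indexes it:

targeted_task = {
    "default": {
        "model": {
            "pt": "distilbert-base-uncased-finetuned-sst-2-english",
            "tf": "distilbert-base-uncased-finetuned-sst-2-english",
        }
    }
}
print(get_default_model(targeted_task, framework=None, task_options=None))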
Example #6
def export(
    tokenizer: PreTrainedTokenizer,
    model: Union[PreTrainedModel, TFPreTrainedModel],
    config: OnnxConfig,
    opset: int,
    output: Path,
) -> Tuple[List[str], List[str]]:
    """
    Export a PyTorch or TensorFlow model to an ONNX Intermediate Representation (IR)

    Args:
        tokenizer ([`PreTrainedTokenizer`]):
            The tokenizer used for encoding the data.
        model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
            The model to export.
        config ([`~onnx.config.OnnxConfig`]):
            The ONNX configuration associated with the exported model.
        opset (`int`):
            The version of the ONNX operator set to use.
        output (`Path`):
            Directory to store the exported ONNX model.

    Returns:
        `Tuple[List[str], List[str]]`: A tuple with an ordered list of the model's inputs, and the named inputs from
        the ONNX configuration.
    """
    if not (is_torch_available() or is_tf_available()):
        raise ImportError(
            "Cannot convert because neither PyTorch nor TensorFlow are not installed. "
            "Please install torch or tensorflow first.")

    if is_torch_available():
        from transformers.file_utils import torch_version

        if not is_torch_onnx_dict_inputs_support_available():
            raise AssertionError(
                f"Unsupported PyTorch version, minimum required is 1.8.0, got: {torch_version}"
            )

    if is_torch_available() and issubclass(type(model), PreTrainedModel):
        return export_pytorch(tokenizer, model, config, opset, output)
    elif is_tf_available() and issubclass(type(model), TFPreTrainedModel):
        return export_tensorflow(tokenizer, model, config, opset, output)
    else:
        # Fail loudly instead of silently returning None for unsupported model types.
        raise ValueError(f"Cannot export {type(model).__name__}: it is neither a PyTorch nor a TensorFlow model.")
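A hedged sketch of exporting a PyTorch model; the `DistilBertOnnxConfig` import path is an assumption and has moved between transformers versions:

from pathlib import Path

from transformers import AutoModel, AutoTokenizer
from transformers.models.distilbert import DistilBertOnnxConfig  # path assumed

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")
onnx_config = DistilBertOnnxConfig(model.config)
onnx_inputs, onnx_outputs = export(tokenizer, model, onnx_config, opset=11, output=Path("distilbert.onnx"))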
Example #7
def require_retrieval(test_case):
    """
    Decorator marking a test that requires a set of dependencies necessary to perform retrieval with
    :class:`~transformers.RagRetriever`.

    These tests are skipped when respective libraries are not installed.

    """
    if not (is_tf_available() and is_datasets_available() and is_faiss_available()):
        test_case = unittest.skip("test requires tensorflow, datasets and faiss")(test_case)
    return test_case
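Typical usage is as a decorator on a test case; the class below is a placeholder:

import unittest

@require_retrieval
class TFRagRetrieverIntegrationTest(unittest.TestCase):
    def test_placeholder(self):
        self.assertTrue(True)  # skipped unless tensorflow, datasets and faiss are installed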
Example #8
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)

    if is_torch_available():
        torch.manual_seed(seed)
        #torch.cuda.manual_seed_all(seed)

    if is_tf_available():
        import tensorflow as tf
        tf.random.set_seed(seed)
Example #9
def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
    """
    Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR)

    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
        output: Path where the generated ONNX model will be stored

    Note: TensorFlow cannot export models bigger than 2GB due to an internal constraint in TensorFlow

    """
    if not is_tf_available():
        raise Exception(
            "Cannot convert because TF is not installed. Please install tensorflow first."
        )

    print(
        "/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\"
    )

    try:
        import tensorflow as tf

        import tf2onnx
        from tf2onnx import __version__ as t2ov

        print(
            f"Using framework TensorFlow: {tf.version.VERSION}, tf2onnx: {t2ov}"
        )

        # Build
        input_names, output_names, dynamic_axes, tokens = infer_shapes(
            nlp, "tf")

        # Forward
        nlp.model.predict(tokens.data)
        input_signature = [
            tf.TensorSpec.from_tensor(tensor, name=key)
            for key, tensor in tokens.items()
        ]
        model_proto, _ = tf2onnx.convert.from_keras(
            nlp.model,
            input_signature,
            opset=opset,
            output_path=output.as_posix())

    except ImportError as e:
        raise Exception(
            f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first. {e}"
        )
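A hypothetical conversion of a TensorFlow-backed pipeline (requires tensorflow and tf2onnx; the default model behind the pipeline is downloaded on first use):

from pathlib import Path

from transformers import pipeline

nlp = pipeline("sentiment-analysis", framework="tf")
convert_tensorflow(nlp, opset=12, output=Path("classifier.onnx"))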
Example #10
def get_models(module):
    """ Get the objects in module that are models."""
    models = []
    model_classes = ()
    if is_torch_available():
        model_classes += (transformers.PreTrainedModel, )
    if is_tf_available():
        model_classes += (transformers.TFPreTrainedModel, )
    for attr_name in dir(module):
        if "Pretrained" in attr_name or "PreTrained" in attr_name:
            continue
        attr = getattr(module, attr_name)
        if isinstance(attr, type) and issubclass(
                attr, model_classes) and attr.__module__ == module.__name__:
            models.append((attr_name, attr))
    return models
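A usage sketch; the module path below follows the post-v4 layout (`transformers.models.<name>.modeling_<name>`), while this snippet's era may have used the flat `transformers.modeling_<name>` layout instead:

from transformers.models.bert import modeling_bert

for name, cls in get_models(modeling_bert):
    print(name)  # BertModel, BertForMaskedLM, ...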
Example #11
    def __init__(self, args: Namespace):
        self.logger = logging.get_logger("transformers-cli/training")

        self.framework = "tf" if is_tf_available() else "torch"

        os.makedirs(args.output, exist_ok=True)
        self.output = args.output

        self.column_label = args.column_label
        self.column_text = args.column_text
        self.column_id = args.column_id

        self.logger.info("Loading {} pipeline for {}".format(
            args.task, args.model))
        if args.task == "text_classification":
            self.pipeline = TextClassificationPipeline.from_pretrained(
                args.model)
        elif args.task == "token_classification":
            raise NotImplementedError
        elif args.task == "question_answering":
            raise NotImplementedError

        self.logger.info("Loading dataset from {}".format(args.train_data))
        self.train_dataset = Processor.create_from_csv(
            args.train_data,
            column_label=args.column_label,
            column_text=args.column_text,
            column_id=args.column_id,
            skip_first_row=args.skip_first_row,
        )
        self.valid_dataset = None
        if args.validation_data:
            self.logger.info("Loading validation dataset from {}".format(
                args.validation_data))
            self.valid_dataset = Processor.create_from_csv(
                args.validation_data,
                column_label=args.column_label,
                column_text=args.column_text,
                column_id=args.column_id,
                skip_first_row=args.skip_first_row,
            )

        self.validation_split = args.validation_split
        self.train_batch_size = args.train_batch_size
        self.valid_batch_size = args.valid_batch_size
        self.learning_rate = args.learning_rate
        self.adam_epsilon = args.adam_epsilon
Example #12
def convert_tensorflow(nlp: Pipeline, opset: int, output: Path):
    """
    Export a TensorFlow backed pipeline to ONNX Intermediate Representation (IR)

    Args:
        nlp: The pipeline to be exported
        opset: The actual version of the ONNX operator set to use
        output: Path where the generated ONNX model will be stored

    Note: TensorFlow cannot export models bigger than 2GB due to an internal constraint in TensorFlow

    """
    if not is_tf_available():
        raise Exception(
            "Cannot convert because TF is not installed. Please install tensorflow first."
        )

    print(
        "/!\\ Please note TensorFlow doesn't support exporting model > 2Gb /!\\"
    )

    try:
        import tensorflow as tf

        from keras2onnx import __version__ as k2ov
        from keras2onnx import convert_keras, save_model

        print(
            f"Using framework TensorFlow: {tf.version.VERSION}, keras2onnx: {k2ov}"
        )

        # Build
        input_names, output_names, dynamic_axes, tokens = infer_shapes(
            nlp, "tf")

        # Forward
        nlp.model.predict(tokens.data)
        onnx_model = convert_keras(nlp.model,
                                   nlp.model.name,
                                   target_opset=opset)
        save_model(onnx_model, output.as_posix())

    except ImportError as e:
        raise Exception(
            f"Cannot import {e.name} required to convert TF model to ONNX. Please install {e.name} first."
        )
Example #13
def get_all_auto_configured_models():
    """ Return the list of all models in at least one auto class."""
    result = set(
    )  # To avoid duplicates we concatenate all model classes in a set.
    if is_torch_available():
        for attr_name in dir(transformers.modeling_auto):
            if attr_name.startswith("MODEL_") and attr_name.endswith(
                    "MAPPING"):
                result = result | set(
                    getattr(transformers.modeling_auto, attr_name).values())
    if is_tf_available():
        for attr_name in dir(transformers.modeling_tf_auto):
            if attr_name.startswith("TF_MODEL_") and attr_name.endswith(
                    "MAPPING"):
                result = result | set(
                    getattr(transformers.modeling_tf_auto, attr_name).values())
    return [cls.__name__ for cls in result]
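Usage is a plain call; note the function relies on the pre-v4 flat module layout (`transformers.modeling_auto` / `transformers.modeling_tf_auto`):

model_names = get_all_auto_configured_models()
print(len(model_names), sorted(model_names)[:3])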
Example #14
def set_seed(seed: int):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if
    installed).

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # safe to call this function even if cuda is not available
    if is_tf_available():
        import tensorflow as tf
        tf.random.set_seed(seed)
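Usage is a single call at the top of a script or training run:

set_seed(42)  # seeds random, numpy, and torch/tf when installed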
Example #15
def load_graph_from_args(pipeline_name: str,
                         framework: str,
                         model: str,
                         tokenizer: Optional[str] = None,
                         **models_kwargs) -> Pipeline:
    """
    Convert the set of arguments provided through the CLI to an actual pipeline reference (tokenizer + model)

    Args:
        pipeline_name: The kind of pipeline to use (ner, question-answering, etc.)
        framework: The framework the pipeline is backed by ("pt" or "tf")
        model: The model name which will be loaded by the pipeline
        tokenizer: The tokenizer name which will be loaded by the pipeline; defaults to the model's value

    Returns: Pipeline object

    """
    # If no tokenizer provided
    if tokenizer is None:
        tokenizer = model

    # Check the wanted framework is available
    if framework == "pt" and not is_torch_available():
        raise Exception(
            "Cannot convert because PyTorch is not installed. Please install torch first."
        )
    if framework == "tf" and not is_tf_available():
        raise Exception(
            "Cannot convert because TF is not installed. Please install tensorflow first."
        )

    print(f"Loading pipeline (model: {model}, tokenizer: {tokenizer})")

    # Allocate tokenizer and model
    return pipeline(pipeline_name,
                    model=model,
                    tokenizer=tokenizer,
                    framework=framework,
                    model_kwargs=models_kwargs)
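A hypothetical invocation; the model name is illustrative and will be downloaded by the pipeline:

nlp = load_graph_from_args(
    "sentiment-analysis",
    framework="pt",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)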
Example #16
def convert_examples_to_features(examples, tokenizer,
                                 max_length=512,
                                 label_list=None,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True,
                                 sample_negatives=False):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)
        sample_negatives: If set to ``True``, additionally emits negative features built by sampling ``text_b``
            from other examples

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    if examples[0].text_b is not None:
        k = len(examples[0].text_b)
    if sample_negatives:
        neg_indices = [np.random.choice(len(examples), size=len(examples), replace=False) for i in range(k)]
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)

        if type(example.text_a) is list:
            text_a = example.text_a
            text_b = [example.text_b]*len(text_a)
        elif type(example.text_b) is list:
            text_b = example.text_b
            if sample_negatives:
                label_idx = label_map[example.label]
                text_b_neg = [(examples[neg_indices[i][ex_index]]).text_b[label_idx] for i in range(k)]
                text_b_neg[label_idx] = text_b[label_idx]

            text_a = [example.text_a]*len(text_b)
        else:
            text_a = [example.text_a]
            text_b = [example.text_b]

        if 0:  # debug output for sample_negatives
            print('Created negative samples')
            print('Original example: label:{} text_a: {} text_b1: {}, 2: {}, 3:{}'.format(example.label, text_a[0], text_b[0], text_b[1], text_b[2]))
            print('Converted example: text_a: {} text_b1: {}, 2: {}, 3:{}'.format(text_a[0], text_b_neg[0], text_b_neg[1], text_b_neg[2]))

        def get_indices(t1, t2):
            out = []
            for a,b in zip(t1, t2):
                inputs = tokenizer.encode_plus(
                    a,
                    b,
                    add_special_tokens=True,
                    max_length=max_length,
                )
                input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

                # The mask has 1 for real tokens and 0 for padding tokens. Only real
                # tokens are attended to.
                attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

                # Zero-pad up to the sequence length.
                padding_length = max_length - len(input_ids)
                if pad_on_left:
                    input_ids = ([pad_token] * padding_length) + input_ids
                    attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
                    token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
                else:
                    input_ids = input_ids + ([pad_token] * padding_length)
                    attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
                    token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

                assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
                assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
                assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
                out.append((input_ids, attention_mask, token_type_ids))

            if len(t1) == 1:
                input_ids, attention_mask, token_type_ids = out[0]
            else:
                input_ids, attention_mask, token_type_ids = zip(*out)
            return input_ids, attention_mask, token_type_ids
        
        input_ids, attention_mask, token_type_ids = get_indices(text_a, text_b)
        if sample_negatives:
            input_ids_n, attention_mask_n, token_type_ids_n = get_indices(text_a, text_b_neg)

        label = label_map[example.label]

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              token_type_ids=token_type_ids,
                              label=label))

        if sample_negatives:
            features.append(
                    InputFeatures(input_ids=input_ids_n,
                                  attention_mask=attention_mask_n,
                                  token_type_ids=token_type_ids_n,
                                  label=label))

    if is_tf_available() and is_tf_dataset:
        def gen():
            for ex in features:
                yield  ({'input_ids': ex.input_ids,
                         'attention_mask': ex.attention_mask,
                         'token_type_ids': ex.token_type_ids},
                        ex.label)

        return tf.data.Dataset.from_generator(gen,
            ({'input_ids': tf.int32,
              'attention_mask': tf.int32,
              'token_type_ids': tf.int32},
             tf.int64),
            ({'input_ids': tf.TensorShape([None]),
              'attention_mask': tf.TensorShape([None]),
              'token_type_ids': tf.TensorShape([None])},
             tf.TensorShape([])))

    return features
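A hedged sketch of calling the function on a small hand-built example; it assumes the module-level `logger`, `np` and `InputFeatures` names used inside the function are in scope:

from transformers import AutoTokenizer, InputExample

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
examples = [InputExample(guid="0", text_a="Is it raining?", text_b="Yes, heavily.", label="yes")]
features = convert_examples_to_features(examples, tokenizer, max_length=64, label_list=["yes", "no"])
print(features[0].input_ids[:10])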
Example #17
class TFViTMAEModelTest(TFModelTesterMixin, unittest.TestCase):
    """
    Here we also overwrite some of the tests of test_modeling_common.py, as ViTMAE does not use input_ids, inputs_embeds,
    attention_mask and seq_length.
    """

    all_model_classes = (TFViTMAEModel, TFViTMAEForPreTraining) if is_tf_available() else ()

    test_pruning = False
    test_onnx = False
    test_resize_embeddings = False
    test_head_masking = False

    def setUp(self):
        self.model_tester = TFViTMAEModelTester(self)
        self.config_tester = ConfigTester(self, config_class=ViTMAEConfig, has_text_modality=False, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    @unittest.skip(reason="ViTMAE does not use inputs_embeds")
    def test_inputs_embeds(self):
        # ViTMAE does not use inputs_embeds
        pass

    def test_model_common_attributes(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer))
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer))

    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.call)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = [*signature.parameters.keys()]

            expected_arg_names = ["pixel_values"]
            self.assertListEqual(arg_names[:1], expected_arg_names)

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_for_pretraining(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_pretraining(*config_and_inputs)

    # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise
    # to generate masks during test
    def test_keyword_and_dict_args(self):
        # make the mask reproducible
        np.random.seed(2)

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        num_patches = int((config.image_size // config.patch_size) ** 2)
        noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))

        for model_class in self.all_model_classes:
            model = model_class(config)
            inputs = self._prepare_for_class(inputs_dict, model_class)

            outputs_dict = model(inputs, noise=noise)

            inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
            outputs_keywords = model(**inputs_keywords, noise=noise)
            output_dict = outputs_dict[0].numpy()
            output_keywords = outputs_keywords[0].numpy()

            self.assertLess(np.sum(np.abs(output_dict - output_keywords)), 1e-6)

    # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise
    # to generate masks during test
    def test_numpy_arrays_inputs(self):
        # make the mask reproducible
        np.random.seed(2)

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        num_patches = int((config.image_size // config.patch_size) ** 2)
        noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))

        def prepare_numpy_arrays(inputs_dict):
            inputs_np_dict = {}
            for k, v in inputs_dict.items():
                if tf.is_tensor(v):
                    inputs_np_dict[k] = v.numpy()
                else:
                    inputs_np_dict[k] = np.array(v)  # convert the value, not the key

            return inputs_np_dict

        for model_class in self.all_model_classes:
            model = model_class(config)

            inputs = self._prepare_for_class(inputs_dict, model_class)
            inputs_np = prepare_numpy_arrays(inputs)

            output_for_dict_input = model(inputs_np, noise=noise)
            output_for_kw_input = model(**inputs_np, noise=noise)
            self.assert_outputs_same(output_for_dict_input, output_for_kw_input)

    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        # in ViTMAE, the seq_len equals (number of patches + 1) * (1 - mask_ratio), rounded above
        image_size = to_2tuple(self.model_tester.image_size)
        patch_size = to_2tuple(self.model_tester.patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        seq_len = int(math.ceil((1 - config.mask_ratio) * (num_patches + 1)))
        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
        chunk_length = getattr(self.model_tester, "chunk_length", None)
        if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
            encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes

        for model_class in self.all_model_classes:
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            config.return_dict = True
            model = model_class(config)
            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

            # check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            model = model_class(config)
            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

            if chunk_length is not None:
                self.assertListEqual(
                    list(attentions[0].shape[-4:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
                )
            else:
                self.assertListEqual(
                    list(attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )
            out_len = len(outputs)

            # Check attention is always last and order is fine
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = True
            model = model_class(config)
            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)

            if hasattr(self.model_tester, "num_hidden_states_types"):
                added_hidden_states = self.model_tester.num_hidden_states_types
            elif self.is_encoder_decoder:
                added_hidden_states = 2
            else:
                added_hidden_states = 1
            self.assertEqual(out_len + added_hidden_states, len(outputs))

            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions

            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
            if chunk_length is not None:
                self.assertListEqual(
                    list(self_attentions[0].shape[-4:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, chunk_length, encoder_key_length],
                )
            else:
                self.assertListEqual(
                    list(self_attentions[0].shape[-3:]),
                    [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
                )

    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)

            outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states

            expected_num_layers = getattr(
                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
            )
            self.assertEqual(len(hidden_states), expected_num_layers)

            # ViTMAE has a different seq_length
            image_size = to_2tuple(self.model_tester.image_size)
            patch_size = to_2tuple(self.model_tester.patch_size)
            num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
            seq_length = int(math.ceil((1 - config.mask_ratio) * (num_patches + 1)))

            self.assertListEqual(
                list(hidden_states[0].shape[-2:]),
                [seq_length, self.model_tester.hidden_size],
            )

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)

    # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise
    # to generate masks during test
    def check_pt_tf_models(self, tf_model, pt_model, tf_inputs_dict):

        # make masks reproducible
        np.random.seed(2)

        num_patches = int((tf_model.config.image_size // tf_model.config.patch_size) ** 2)
        noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))
        tf_noise = tf.constant(noise)

        # Add `noise` argument.
        # PT inputs will be prepared in `super().check_pt_tf_models()` with this added `noise` argument
        tf_inputs_dict["noise"] = tf_noise

        super().check_pt_tf_models(tf_model, pt_model, tf_inputs_dict)

    # overwrite from common since TFViTMAEForPreTraining outputs loss along with
    # logits and mask indices. loss and mask indices are not suitable for integration
    # with other keras modules.
    def test_compile_tf_model(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")

        for model_class in self.all_model_classes:
            # `pixel_values` implies that the input is an image
            inputs = tf.keras.Input(
                batch_shape=(
                    3,
                    self.model_tester.num_channels,
                    self.model_tester.image_size,
                    self.model_tester.image_size,
                ),
                name="pixel_values",
                dtype="float32",
            )

            # Prepare our model
            model = model_class(config)
            model(self._prepare_for_class(inputs_dict, model_class))  # Model must be called before saving.
            # Let's load it from the disk to be sure we can use pretrained weights
            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname, saved_model=False)
                model = model_class.from_pretrained(tmpdirname)

            outputs_dict = model(inputs)
            hidden_states = outputs_dict[0]

            # `TFViTMAEForPreTraining` outputs are not recommended to be used for
            #  downstream application. This is just to check if the outputs of
            # `TFViTMAEForPreTraining` can be integrated with other keras modules.
            if model_class.__name__ == "TFViTMAEForPreTraining":
                hidden_states = outputs_dict["logits"]

            # Add a dense layer on top to test integration with other keras modules
            outputs = tf.keras.layers.Dense(2, activation="softmax", name="outputs")(hidden_states)

            # Compile extended model
            extended_model = tf.keras.Model(inputs=[inputs], outputs=[outputs])
            extended_model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

    # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise
    # to generate masks during test
    def test_keras_save_load(self):
        # make mask reproducible
        np.random.seed(2)

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        tf_main_layer_classes = set(
            module_member
            for model_class in self.all_model_classes
            for module in (import_module(model_class.__module__),)
            for module_member_name in dir(module)
            if module_member_name.endswith("MainLayer")
            # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
            and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
            for module_member in (getattr(module, module_member_name),)
            if isinstance(module_member, type)
            and tf.keras.layers.Layer in module_member.__bases__
            and getattr(module_member, "_keras_serializable", False)
        )

        num_patches = int((config.image_size // config.patch_size) ** 2)
        noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))
        noise = tf.convert_to_tensor(noise)
        inputs_dict.update({"noise": noise})

        for main_layer_class in tf_main_layer_classes:
            main_layer = main_layer_class(config)

            symbolic_inputs = {
                name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
            }

            model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
            outputs = model(inputs_dict)

            with tempfile.TemporaryDirectory() as tmpdirname:
                filepath = os.path.join(tmpdirname, "keras_model.h5")
                model.save(filepath)
                model = tf.keras.models.load_model(
                    filepath, custom_objects={main_layer_class.__name__: main_layer_class}
                )
                assert isinstance(model, tf.keras.Model)
                after_outputs = model(inputs_dict)
                self.assert_outputs_same(after_outputs, outputs)

    # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise
    # to generate masks during test
    def test_save_load(self):
        # make mask reproducible
        np.random.seed(2)

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        num_patches = int((config.image_size // config.patch_size) ** 2)
        noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))

        for model_class in self.all_model_classes:
            model = model_class(config)
            model_input = self._prepare_for_class(inputs_dict, model_class)
            outputs = model(model_input, noise=noise)

            if model_class.__name__ == "TFViTMAEModel":
                out_2 = outputs.last_hidden_state.numpy()
                out_2[np.isnan(out_2)] = 0
            else:
                out_2 = outputs.logits.numpy()
                out_2[np.isnan(out_2)] = 0

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname, saved_model=True)
                saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
                model = tf.keras.models.load_model(saved_model_dir)
                after_outputs = model(model_input, noise=noise)

                if model_class.__name__ == "TFViTMAEModel":
                    out_1 = after_outputs["last_hidden_state"].numpy()
                    out_1[np.isnan(out_1)] = 0
                else:
                    out_1 = after_outputs["logits"].numpy()
                    out_1[np.isnan(out_1)] = 0

                max_diff = np.amax(np.abs(out_1 - out_2))
                self.assertLessEqual(max_diff, 1e-5)

    # overwrite from common since TFViTMAEForPretraining has random masking, we need to fix the noise
    # to generate masks during test
    def test_save_load_config(self):
        # make mask reproducible
        np.random.seed(2)

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        num_patches = int((config.image_size // config.patch_size) ** 2)
        noise = np.random.uniform(size=(self.model_tester.batch_size, num_patches))

        for model_class in self.all_model_classes:
            model = model_class(config)
            model_inputs = self._prepare_for_class(inputs_dict, model_class)

            outputs = model(model_inputs, noise=noise)
            model_config = model.get_config()
            # make sure that returned config is jsonifiable, which is required by keras
            json.dumps(model_config)
            new_model = model_class.from_config(model.get_config())
            # make sure it also accepts a normal config
            _ = model_class.from_config(model.config)
            _ = new_model(model_inputs)  # Build model
            new_model.set_weights(model.get_weights())
            after_outputs = new_model(model_inputs, noise=noise)

            self.assert_outputs_same(after_outputs, outputs)

    @unittest.skip(
        reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load
    to get deterministic results."""
    )
    def test_determinism(self):
        pass

    @unittest.skip(reason="""ViTMAE returns a random mask + ids_restore in each forward pass. See test_save_load""")
    def test_model_outputs_equivalence(self):
        pass

    @slow
    def test_model_from_pretrained(self):

        model = TFViTMAEModel.from_pretrained("google/vit-base-patch16-224")
        self.assertIsNotNone(model)
Example #18
def glue_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
    load_id=False,
    load_element_id=False,
):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
            actual values)
        load_id: add id for each example

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``, will return
        a list of task-specific ``InputFeatures`` which can be fed to the model.

    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)
            example = processor.tfds_map(example)

        inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,
                                       return_element_type_ids=load_element_id)
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        if load_element_id:
            element_type_ids = inputs["element_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
            if load_element_id:
                element_type_ids = ([pad_token_segment_id] * padding_length) + element_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
            if load_element_id:
                element_type_ids = element_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
            len(attention_mask), max_length
        )
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
            len(token_type_ids), max_length
        )
        if load_element_id:
            assert len(element_type_ids) == max_length, "Error with input length {} vs {}".format(
                len(element_type_ids), max_length
            )

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 6:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            if load_element_id:
                logger.info("element_type_ids: %s" % " ".join([str(x) for x in element_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        if load_id:
            if load_element_id:
                features.append(
                    InputFeatures(
                        input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label,
                        example_id=example.guid, element_type_ids=element_type_ids
                    )
                )
            else:
                features.append(
                    InputFeatures(
                        input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label,
                        example_id=example.guid
                    )
                )
        else:
            if load_element_id:
                features.append(
                    InputFeatures(
                        input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label,
                        element_type_ids=element_type_ids
                    )
                )
            else:
                features.append(
                    InputFeatures(
                        input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=label
                    )
                )


    if is_tf_available() and is_tf_dataset:

        def gen():
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    ex.label,
                )

        return tf.data.Dataset.from_generator(
            gen,
            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                tf.TensorShape([]),
            ),
        )


    return features
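A hedged sketch on a single MRPC-style pair; it assumes `glue_processors`, `glue_output_modes`, `logger` and `InputFeatures` are in scope as in the original module, and uses a slow tokenizer since the custom `return_element_type_ids` keyword is simply ignored there:

from transformers import AutoTokenizer, InputExample

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
examples = [InputExample(guid="0", text_a="He ate dinner.", text_b="He dined.", label="1")]
features = glue_convert_examples_to_features(examples, tokenizer, max_length=128, task="mrpc")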
Example #19
class TFCLIPVisionModelTest(TFModelTesterMixin, unittest.TestCase):
    """
    Here we also overwrite some of the tests of test_modeling_common.py, as CLIP does not use input_ids, inputs_embeds,
    attention_mask and seq_length.
    """

    all_model_classes = (TFCLIPVisionModel,) if is_tf_available() else ()

    test_pruning = False
    test_resize_embeddings = False
    test_head_masking = False
    test_onnx = False

    def setUp(self):
        self.model_tester = TFCLIPVisionModelTester(self)
        self.config_tester = ConfigTester(self, config_class=CLIPVisionConfig, has_text_modality=False, hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_inputs_embeds(self):
        # CLIP does not use inputs_embeds
        pass

    def test_graph_mode_with_inputs_embeds(self):
        # CLIP does not use inputs_embeds
        pass

    def test_model_common_attributes(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(), (tf.keras.layers.Layer))
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer))

    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.call)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = [*signature.parameters.keys()]

            expected_arg_names = ["pixel_values"]
            self.assertListEqual(arg_names[:1], expected_arg_names)

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        # in CLIP, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
        image_size = (self.model_tester.image_size, self.model_tester.image_size)
        patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        seq_len = num_patches + 1

        for model_class in self.all_model_classes:
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            config.return_dict = True
            model = model_class(config)
            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
            attentions = outputs.attentions
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

            # check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            model = model_class(config)
            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)
            attentions = outputs.attentions
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

            out_len = len(outputs)

            # Check attention is always last and order is fine
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = True
            model = model_class(config)
            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)

            added_hidden_states = 1
            self.assertEqual(out_len + added_hidden_states, len(outputs))

            self_attentions = outputs.attentions

            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)

            self.assertListEqual(
                list(self_attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, seq_len, seq_len],
            )

    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)

            outputs = model(**self._prepare_for_class(inputs_dict, model_class), training=False)

            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states

            expected_num_layers = getattr(
                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
            )
            self.assertEqual(len(hidden_states), expected_num_layers)

            # CLIP has a different seq_length
            image_size = (self.model_tester.image_size, self.model_tester.image_size)
            patch_size = (self.model_tester.patch_size, self.model_tester.patch_size)
            num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
            seq_length = num_patches + 1

            self.assertListEqual(
                list(hidden_states[0].shape[-2:]),
                [seq_length, self.model_tester.hidden_size],
            )

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)

    @slow
    def test_model_from_pretrained(self):
        for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = TFCLIPVisionModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
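
The seq_len arithmetic used in test_attention_outputs and test_hidden_states_output above can be checked in isolation. A minimal sketch (the concrete sizes are illustrative, not the tester's actual values):

# For a square image cut into square patches, CLIP's sequence length is the
# number of patches plus one [CLS] token.
image_size, patch_size = 30, 2  # illustrative values
num_patches = (image_size // patch_size) * (image_size // patch_size)
seq_len = num_patches + 1  # 15 * 15 + 1 = 226
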
Exemplo n.º 20
0
    get_default_model,
    infer_framework_from_model,
)
from .conversational import Conversation, ConversationalPipeline
from .feature_extraction import FeatureExtractionPipeline
from .fill_mask import FillMaskPipeline
from .question_answering import QuestionAnsweringArgumentHandler, QuestionAnsweringPipeline
from .table_question_answering import TableQuestionAnsweringArgumentHandler, TableQuestionAnsweringPipeline
from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
from .text_classification import TextClassificationPipeline
from .text_generation import TextGenerationPipeline
from .token_classification import NerPipeline, TokenClassificationArgumentHandler, TokenClassificationPipeline
from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline


if is_tf_available():
    import tensorflow as tf

    from transformers.models.auto.modeling_tf_auto import (
        TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
        TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
        TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
        TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
        TF_MODEL_WITH_LM_HEAD_MAPPING,
        TFAutoModel,
        TFAutoModelForCausalLM,
        TFAutoModelForMaskedLM,
        TFAutoModelForQuestionAnswering,
        TFAutoModelForSeq2SeqLM,
        TFAutoModelForSequenceClassification,
        TFAutoModelForTokenClassification,
Exemplo n.º 21
0
class TFCLIPModelTest(TFModelTesterMixin, unittest.TestCase):
    all_model_classes = (TFCLIPModel,) if is_tf_available() else ()
    test_head_masking = False
    test_pruning = False
    test_resize_embeddings = False
    test_attention_outputs = False
    test_onnx = False

    def setUp(self):
        self.model_tester = TFCLIPModelTester(self)

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    # hidden_states are tested in individual model tests
    def test_hidden_states_output(self):
        pass

    # input_embeds are tested in individual model tests
    def test_inputs_embeds(self):
        pass

    # CLIPModel does not have input/output embeddings
    def test_model_common_attributes(self):
        pass

    # overwrite from common since `TFCLIPModelTester` sets `return_loss` to `True`, which causes the preparation
    # of `symbolic_inputs` to fail.
    def test_keras_save_load(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        # remove `return_loss` to make code work
        if self.__class__.__name__ == "TFCLIPModelTest":
            inputs_dict.pop("return_loss", None)

        tf_main_layer_classes = set(
            module_member
            for model_class in self.all_model_classes
            for module in (import_module(model_class.__module__),)
            for module_member_name in dir(module)
            if module_member_name.endswith("MainLayer")
            # This condition is required, since `modeling_tf_clip.py` has 3 classes whose names end with `MainLayer`.
            and module_member_name[: -len("MainLayer")] == model_class.__name__[: -len("Model")]
            for module_member in (getattr(module, module_member_name),)
            if isinstance(module_member, type)
            and tf.keras.layers.Layer in module_member.__bases__
            and getattr(module_member, "_keras_serializable", False)
        )
        for main_layer_class in tf_main_layer_classes:
            # T5MainLayer needs an embed_tokens parameter when called without the inputs_embeds parameter
            if "T5" in main_layer_class.__name__:
                # Take the same values as in TFT5ModelTester for this shared layer
                shared = TFSharedEmbeddings(99, 32, name="shared")
                config.use_cache = inputs_dict.pop("use_cache", None)
                main_layer = main_layer_class(config, embed_tokens=shared)
            else:
                main_layer = main_layer_class(config)

            symbolic_inputs = {
                name: tf.keras.Input(tensor.shape[1:], dtype=tensor.dtype) for name, tensor in inputs_dict.items()
            }

            model = tf.keras.Model(symbolic_inputs, outputs=main_layer(symbolic_inputs))
            outputs = model(inputs_dict)

            with tempfile.TemporaryDirectory() as tmpdirname:
                filepath = os.path.join(tmpdirname, "keras_model.h5")
                model.save(filepath)
                if "T5" in main_layer_class.__name__:
                    model = tf.keras.models.load_model(
                        filepath,
                        custom_objects={
                            main_layer_class.__name__: main_layer_class,
                            "TFSharedEmbeddings": TFSharedEmbeddings,
                        },
                    )
                else:
                    model = tf.keras.models.load_model(
                        filepath, custom_objects={main_layer_class.__name__: main_layer_class}
                    )
                assert isinstance(model, tf.keras.Model)
                after_outputs = model(inputs_dict)
                self.assert_outputs_same(after_outputs, outputs)

    # overwrite from common since CLIPModel/TFCLIPModel return CLIPOutput/TFCLIPOutput
    @is_pt_tf_cross_test
    def test_pt_tf_model_equivalence(self):
        import torch

        import transformers

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            pt_model_class_name = model_class.__name__[2:]  # Skip the "TF" at the beginning
            pt_model_class = getattr(transformers, pt_model_class_name)

            config.output_hidden_states = True

            tf_model = model_class(config)
            pt_model = pt_model_class(config)

            # Check we can load pt model in tf and vice-versa with model => model functions

            tf_model = transformers.load_pytorch_model_in_tf2_model(
                tf_model, pt_model, tf_inputs=self._prepare_for_class(inputs_dict, model_class)
            )
            pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)

            # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
            pt_model.eval()
            pt_inputs_dict = {}
            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
                if type(key) == bool:
                    pt_inputs_dict[name] = key
                elif name == "input_values":
                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
                elif name == "pixel_values":
                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
                else:
                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)

            # need to rename encoder-decoder "inputs" for PyTorch
            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")

            with torch.no_grad():
                pto = pt_model(**pt_inputs_dict)
            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class), training=False)

            self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch")
            for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()):
                if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)):
                    continue

                tf_out = tf_output.numpy()
                pt_out = pt_output.numpy()

                self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch")

                if len(tf_out.shape) > 0:
                    tf_nans = np.copy(np.isnan(tf_out))
                    pt_nans = np.copy(np.isnan(pt_out))

                    pt_out[tf_nans] = 0
                    tf_out[tf_nans] = 0
                    pt_out[pt_nans] = 0
                    tf_out[pt_nans] = 0

                max_diff = np.amax(np.abs(tf_out - pt_out))
                self.assertLessEqual(max_diff, 4e-2)

            # Check we can load pt model in tf and vice-versa with checkpoint => model functions
            with tempfile.TemporaryDirectory() as tmpdirname:
                pt_checkpoint_path = os.path.join(tmpdirname, "pt_model.bin")
                torch.save(pt_model.state_dict(), pt_checkpoint_path)
                tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)

                tf_checkpoint_path = os.path.join(tmpdirname, "tf_model.h5")
                tf_model.save_weights(tf_checkpoint_path)
                pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)

            # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
            pt_model.eval()
            pt_inputs_dict = {}
            for name, key in self._prepare_for_class(inputs_dict, model_class).items():
                if type(key) == bool:
                    key = np.array(key, dtype=bool)
                    pt_inputs_dict[name] = torch.from_numpy(key).to(torch.long)
                elif name == "input_values":
                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
                elif name == "pixel_values":
                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.float32)
                else:
                    pt_inputs_dict[name] = torch.from_numpy(key.numpy()).to(torch.long)
            # need to rename encoder-decoder "inputs" for PyTorch
            if "inputs" in pt_inputs_dict and self.is_encoder_decoder:
                pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs")

            with torch.no_grad():
                pto = pt_model(**pt_inputs_dict)
            tfo = tf_model(self._prepare_for_class(inputs_dict, model_class))

            self.assertEqual(len(tfo), len(pto), "Output lengths differ between TF and PyTorch")
            for tf_output, pt_output in zip(tfo.to_tuple(), pto.to_tuple()):
                if not (isinstance(tf_output, tf.Tensor) and isinstance(pt_output, torch.Tensor)):
                    continue

                tf_out = tf_output.numpy()
                pt_out = pt_output.numpy()

                self.assertEqual(tf_out.shape, pt_out.shape, "Output component shapes differ between TF and PyTorch")

                if len(tf_out.shape) > 0:
                    tf_nans = np.copy(np.isnan(tf_out))
                    pt_nans = np.copy(np.isnan(pt_out))

                    pt_out[tf_nans] = 0
                    tf_out[tf_nans] = 0
                    pt_out[pt_nans] = 0
                    tf_out[pt_nans] = 0

                max_diff = np.amax(np.abs(tf_out - pt_out))
                self.assertLessEqual(max_diff, 4e-2)

    @slow
    def test_model_from_pretrained(self):
        for model_name in TF_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = TFCLIPModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
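
The TF/PT comparison in test_pt_tf_model_equivalence above zeroes NaNs in both outputs at the union of their NaN positions before taking the maximum absolute difference. That logic can be factored into a small helper; a sketch (the helper name is hypothetical, not part of the test suite):

import numpy as np

def max_diff_ignoring_nans(tf_out, pt_out):
    # Copy so the caller's arrays are not mutated, then zero both arrays
    # wherever either one is NaN, and return the max absolute difference.
    tf_out = np.array(tf_out, dtype=np.float64, copy=True)
    pt_out = np.array(pt_out, dtype=np.float64, copy=True)
    nan_mask = np.isnan(tf_out) | np.isnan(pt_out)
    tf_out[nan_mask] = 0.0
    pt_out[nan_mask] = 0.0
    return float(np.amax(np.abs(tf_out - pt_out)))

# self.assertLessEqual(max_diff_ignoring_nans(tf_tensor.numpy(), pt_tensor.numpy()), 4e-2)
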
Exemplo n.º 22
0
def squad_convert_examples_to_features(
        examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False, threads=1
):
    """
    Converts a list of examples into a list of features that can be directly given as input to a model.
    It is model-dependent and takes advantage of many of the tokenizer's features to create the model's inputs.

    Args:
        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
        max_seq_length: The maximum sequence length of the inputs.
        doc_stride: The stride used when the context is too large and is split across several features.
        max_query_length: The maximum length of the query.
        is_training: whether to create features for model evaluation or model training.
        return_dataset: Default False. Either 'pt' or 'tf'.
            if 'pt': returns a torch.utils.data.TensorDataset,
            if 'tf': returns a tf.data.Dataset
        threads: the number of threads used to convert the examples to features with multiprocessing

    Returns:
        list of :class:`~transformers.data.processors.squad.SquadFeatures`

    Example::

        processor = SquadV2Processor()
        examples = processor.get_dev_examples(data_dir)

        features = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
        )
    """

    # Defining helper methods
    features = []
    threads = min(threads, cpu_count())
    if threads == 1:
        print("squad_convert_examples_to_features")
        features = []
        for eg in tqdm(examples, total=len(examples), desc="convert squad examples to features"):
            feat = squad_convert_example_to_features_sp(
                eg,
                max_seq_length=max_seq_length,
                doc_stride=doc_stride,
                max_query_length=max_query_length,
                is_training=is_training,
                tokenizer_for_convert=tokenizer)
            features.append(feat)

    else:
        print("squad_convert_examples_to_features w/ {} threads".format(threads))
        with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
            annotate_ = partial(
                squad_convert_example_to_features,
                max_seq_length=max_seq_length,
                doc_stride=doc_stride,
                max_query_length=max_query_length,
                is_training=is_training,
            )
            features = list(
                tqdm(
                    p.imap(annotate_, examples, chunksize=32),
                    total=len(examples),
                    desc="convert squad examples to features",
                )
            )

    new_features = []
    unique_id = 1000000000
    example_index = 0
    for example_features in tqdm(features, total=len(features), desc="add example index and unique id"):
        if not example_features:
            continue
        for example_feature in example_features:
            example_feature.example_index = example_index
            example_feature.unique_id = unique_id
            new_features.append(example_feature)
            unique_id += 1
        example_index += 1
    features = new_features
    del new_features
    if return_dataset == "pt":
        if not is_torch_available():
            raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.")

        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

        if not is_training:
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            dataset = TensorDataset(
                all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask
            )
        else:
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            dataset = TensorDataset(
                all_input_ids,
                all_attention_masks,
                all_token_type_ids,
                all_start_positions,
                all_end_positions,
                all_cls_index,
                all_p_mask,
            )

        return features, dataset
    elif return_dataset == "tf":
        if not is_tf_available():
            raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.")

        def gen():
            for ex in features:
                yield (
                    {
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask,
                        "token_type_ids": ex.token_type_ids,
                    },
                    {
                        "start_position": ex.start_position,
                        "end_position": ex.end_position,
                        "cls_index": ex.cls_index,
                        "p_mask": ex.p_mask,
                    },
                )

        return tf.data.Dataset.from_generator(
            gen,
            (
                {"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
                {"start_position": tf.int64, "end_position": tf.int64, "cls_index": tf.int64, "p_mask": tf.int32},
            ),
            (
                {
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None]),
                },
                {
                    "start_position": tf.TensorShape([]),
                    "end_position": tf.TensorShape([]),
                    "cls_index": tf.TensorShape([]),
                    "p_mask": tf.TensorShape([None]),
                },
            ),
        )

    return features
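
A minimal usage sketch for the TF branch (parameter values illustrative; `examples` and `tokenizer` as in the docstring example above): with return_dataset="tf", the function returns the tf.data.Dataset directly, and since the tokenizer pads every feature to max_seq_length a plain batch() works.

dataset = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=True,
    return_dataset="tf",
)
dataset = dataset.shuffle(buffer_size=100).batch(8)
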
Exemplo n.º 23
0
class TFSpeech2TextModelTest(TFModelTesterMixin, unittest.TestCase):
    all_model_classes = (
        (TFSpeech2TextModel, TFSpeech2TextForConditionalGeneration) if is_tf_available() else ()
    )
    all_generative_model_classes = (TFSpeech2TextForConditionalGeneration,) if is_tf_available() else ()
    is_encoder_decoder = True
    test_pruning = False
    test_missing_keys = False
    test_onnx = False

    input_name = "input_ids"

    def setUp(self):
        self.model_tester = TFSpeech2TextModelTester(self)
        self.config_tester = ConfigTester(self, config_class=Speech2TextConfig)
        self.maxDiff = 3000

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_decoder_model_past_with_large_inputs(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_decoder_model_past_large_inputs(
            *config_and_inputs)

    # not implemented currently
    def test_inputs_embeds(self):
        pass

    # training is not supported yet
    def test_training(self):
        pass

    def test_training_gradient_checkpointing(self):
        pass

    def test_generate_fp16(self):
        pass

    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)
            outputs = model(
                **self._prepare_for_class(inputs_dict, model_class))

            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states

            expected_num_layers = getattr(
                self.model_tester, "expected_num_hidden_layers",
                self.model_tester.num_hidden_layers + 1)
            self.assertEqual(len(hidden_states), expected_num_layers)

            if hasattr(self.model_tester, "encoder_seq_length"):
                seq_length = self.model_tester.encoder_seq_length
            else:
                seq_length = self.model_tester.seq_length

            subsampled_seq_length = model._get_feat_extract_output_lengths(
                seq_length)

            self.assertListEqual(
                list(hidden_states[0].shape[-2:]),
                [subsampled_seq_length, self.model_tester.hidden_size],
            )

            if config.is_encoder_decoder:
                hidden_states = outputs.decoder_hidden_states

                self.assertIsInstance(hidden_states, (list, tuple))
                self.assertEqual(len(hidden_states), expected_num_layers)
                seq_len = getattr(self.model_tester, "seq_length", None)
                decoder_seq_length = getattr(self.model_tester,
                                             "decoder_seq_length", seq_len)

                self.assertListEqual(
                    list(hidden_states[0].shape[-2:]),
                    [decoder_seq_length, self.model_tester.hidden_size],
                )

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)

    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        seq_len = getattr(self.model_tester, "seq_length", None)
        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length",
                                     seq_len)
        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length",
                                     seq_len)
        decoder_key_length = getattr(self.model_tester, "decoder_key_length",
                                     decoder_seq_length)
        encoder_key_length = getattr(self.model_tester, "key_length",
                                     encoder_seq_length)

        for model_class in self.all_model_classes:
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            config.return_dict = True
            model = model_class(config)

            subsampled_encoder_seq_length = model._get_feat_extract_output_lengths(
                encoder_seq_length)
            subsampled_encoder_key_length = model._get_feat_extract_output_lengths(
                encoder_key_length)

            outputs = model(
                **self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
            self.assertEqual(len(attentions),
                             self.model_tester.num_hidden_layers)

            # check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            model = model_class(config)

            outputs = model(
                **self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
            self.assertEqual(len(attentions),
                             self.model_tester.num_hidden_layers)

            self.assertListEqual(
                list(attentions[0].shape[-3:]),
                [
                    self.model_tester.num_attention_heads,
                    subsampled_encoder_seq_length,
                    subsampled_encoder_key_length
                ],
            )
            out_len = len(outputs)

            correct_outlen = 5

            # loss is at first position
            if "labels" in inputs_dict:
                correct_outlen += 1  # loss is added to beginning
            if "past_key_values" in outputs:
                correct_outlen += 1  # past_key_values have been returned

            self.assertEqual(out_len, correct_outlen)

            # decoder attentions
            decoder_attentions = outputs.decoder_attentions
            self.assertIsInstance(decoder_attentions, (list, tuple))
            self.assertEqual(len(decoder_attentions),
                             self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(decoder_attentions[0].shape[-3:]),
                [
                    self.model_tester.num_attention_heads, decoder_seq_length,
                    decoder_key_length
                ],
            )

            # cross attentions
            cross_attentions = outputs.cross_attentions
            self.assertIsInstance(cross_attentions, (list, tuple))
            self.assertEqual(len(cross_attentions),
                             self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(cross_attentions[0].shape[-3:]),
                [
                    self.model_tester.num_attention_heads,
                    decoder_seq_length,
                    subsampled_encoder_key_length,
                ],
            )

            # Check attention is always last and order is fine
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = True
            model = model_class(config)

            outputs = model(
                **self._prepare_for_class(inputs_dict, model_class))

            added_hidden_states = 2
            self.assertEqual(out_len + added_hidden_states, len(outputs))

            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions

            self.assertEqual(len(self_attentions),
                             self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(self_attentions[0].shape[-3:]),
                [
                    self.model_tester.num_attention_heads,
                    subsampled_encoder_seq_length,
                    subsampled_encoder_key_length
                ],
            )

    def test_resize_token_embeddings(self):
        # Overwritten method from parent; see `test_resize_embeddings_untied`
        pass

    def test_resize_tokens_embeddings(self):
        # see `test_resize_embeddings_untied`
        pass

    def test_resize_embeddings_untied(self):
        # TODO: copy test from PT. Not working at the moment because the test relies on `model.resize_token_embeddings`,
        # whose TF implementation assumes the use of `TFWrappedEmbeddings`. But with a `TFWrappedEmbeddings` we can't
        # load the weights from PT (also, it induces TF1 behavior, so we might want to rework how
        # `model.resize_token_embeddings` operates).
        pass

    def test_generate_without_input_ids(self):
        pass

    @staticmethod
    def _get_encoder_outputs(model,
                             input_ids,
                             attention_mask,
                             output_attentions=None,
                             output_hidden_states=None,
                             num_interleave=1):
        encoder = model.get_encoder()
        encoder_outputs = encoder(
            input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        encoder_outputs["last_hidden_state"] = tf.repeat(
            encoder_outputs.last_hidden_state, num_interleave, axis=0)

        input_ids = input_ids[:, :, 0]
        input_ids = tf.zeros_like(
            input_ids[:, :1],
            dtype=tf.int64) + model._get_decoder_start_token_id()
        attention_mask = None
        return encoder_outputs, input_ids, attention_mask

    def _check_outputs(self,
                       output,
                       input_ids,
                       config,
                       use_cache=False,
                       num_return_sequences=1):
        batch_size, seq_length = input_ids.shape[:2]
        subsampled_seq_length = self.model_tester.get_subsampled_output_lengths(
            seq_length)
        num_sequences_in_output = batch_size * num_return_sequences
        gen_len = (output.sequences.shape[-1] - 1 if config.is_encoder_decoder
                   else output.sequences.shape[-1] - seq_length)

        # scores
        self._check_scores(num_sequences_in_output,
                           output.scores,
                           length=gen_len,
                           config=config)

        # Attentions
        # encoder
        self._check_encoder_attention_for_generate(output.encoder_attentions,
                                                   batch_size, config,
                                                   subsampled_seq_length)
        # decoder
        self._check_attentions_for_generate(
            num_sequences_in_output,
            output.decoder_attentions,
            min_length=1,
            max_length=output.sequences.shape[-1],
            config=config,
            use_cache=use_cache,
        )

        # Hidden States
        # encoder
        self._check_encoder_hidden_states_for_generate(
            output.encoder_hidden_states, batch_size, config,
            subsampled_seq_length)

        # decoder
        self._check_hidden_states_for_generate(
            num_sequences_in_output,
            output.decoder_hidden_states,
            min_length=1,
            max_length=output.sequences.shape[-1],
            config=config,
            use_cache=use_cache,
        )

    # overwritten from parent because the common test does not work when the model's non-text input is not
    # passed, and because the input here is `input_features`
    def test_lm_head_model_random_no_beam_search_generate(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        input_features = inputs_dict.get("input_features", None)

        # iterate over all generative models
        for model_class in self.all_generative_model_classes:
            model = model_class(config)

            if config.bos_token_id is None:
                # if bos token id is not defined model needs input_features
                with self.assertRaises(AssertionError):
                    model.generate(do_sample=True, max_length=5)
                # num_return_sequences = 1
                self._check_generated_ids(
                    model.generate(input_features, do_sample=True))

            with self.assertRaises(AssertionError):
                # generating multiple sequences without sampling or beam search
                # is not allowed, as it would always generate the same sequences
                model.generate(input_features,
                               do_sample=False,
                               num_return_sequences=2)

            # num_return_sequences > 1, sample
            self._check_generated_ids(
                model.generate(input_features,
                               do_sample=True,
                               num_return_sequences=2))

            # check bad words tokens language generation
            # create list of 1-seq bad token and list of 2-seq of bad tokens
            bad_words_ids = [
                self._generate_random_bad_tokens(1, model),
                self._generate_random_bad_tokens(2, model)
            ]
            output_tokens = model.generate(input_features,
                                           do_sample=True,
                                           bad_words_ids=bad_words_ids,
                                           num_return_sequences=2)
            # only count generated tokens
            generated_ids = output_tokens[:, input_features.shape[-1]:]
            self.assertFalse(
                self._check_match_tokens(generated_ids.numpy().tolist(),
                                         bad_words_ids))

    # overwritten from parent because the common test does not work when the model's non-text input is not
    # passed, and because the input here is `input_features`
    def test_lm_head_model_random_beam_search_generate(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        input_features = inputs_dict.get("input_features", None)

        for model_class in self.all_generative_model_classes:
            model = model_class(config)

            if config.bos_token_id is None:
                # if bos token id is not defined, the model needs input_features; num_return_sequences = 1
                self._check_generated_ids(
                    model.generate(input_features, do_sample=True,
                                   num_beams=2))

            with self.assertRaises(AssertionError):
                # generating more return sequences than we have beams is not possible
                model.generate(input_features,
                               do_sample=False,
                               num_return_sequences=3,
                               num_beams=2)

            # num_return_sequences > 1, sample
            self._check_generated_ids(
                model.generate(
                    input_features,
                    do_sample=True,
                    num_beams=2,
                    num_return_sequences=2,
                ))
            # num_return_sequences > 1, greedy
            self._check_generated_ids(
                model.generate(input_features,
                               do_sample=False,
                               num_beams=2,
                               num_return_sequences=2))

            # check bad words tokens language generation
            # create list of 1-seq bad token and list of 2-seq of bad tokens
            bad_words_ids = [
                self._generate_random_bad_tokens(1, model),
                self._generate_random_bad_tokens(2, model)
            ]
            output_tokens = model.generate(input_features,
                                           do_sample=False,
                                           bad_words_ids=bad_words_ids,
                                           num_beams=2,
                                           num_return_sequences=2)
            # only count generated tokens
            generated_ids = output_tokens[:, input_features.shape[-1]:]
            self.assertFalse(
                self._check_match_tokens(generated_ids.numpy().tolist(),
                                         bad_words_ids))

    # overwritten from parent -- the input is `input_features`, not `input_ids`
    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.call)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = [*signature.parameters.keys()]

            expected_arg_names = [
                "input_features",
                "attention_mask",
                "decoder_input_ids",
                "decoder_attention_mask",
            ]
            self.assertListEqual(arg_names[:len(expected_arg_names)],
                                 expected_arg_names)
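
The _get_feat_extract_output_lengths calls above account for the convolutional subsampling in front of the Speech2Text encoder. A minimal sketch of that length computation (assuming two stride-2 convolutional layers, as in Speech2Text; this is not the model's actual method):

def get_subsampled_length(input_length, num_conv_layers=2):
    # Each stride-2 convolution roughly halves the sequence length.
    for _ in range(num_conv_layers):
        input_length = (input_length - 1) // 2 + 1
    return input_length

# get_subsampled_length(100) == 25: a 100-frame input yields 25 encoder positions
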
Exemplo n.º 24
0
class TFViTModelTest(TFModelTesterMixin, unittest.TestCase):
    """
    Here we also overwrite some of the tests of test_modeling_tf_common.py, as ViT does not use input_ids, inputs_embeds,
    attention_mask and seq_length.
    """

    all_model_classes = (TFViTModel, TFViTForImageClassification) if is_tf_available() else ()

    test_resize_embeddings = False
    test_head_masking = False
    test_onnx = False

    def setUp(self):
        self.model_tester = TFViTModelTester(self)
        self.config_tester = ConfigTester(self,
                                          config_class=ViTConfig,
                                          has_text_modality=False,
                                          hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_inputs_embeds(self):
        # ViT does not use inputs_embeds
        pass

    def test_graph_mode_with_inputs_embeds(self):
        # ViT does not use inputs_embeds
        pass

    def test_model_common_attributes(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(),
                                  (tf.keras.layers.Layer))
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer))

    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.call)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = [*signature.parameters.keys()]

            expected_arg_names = ["pixel_values"]
            self.assertListEqual(arg_names[:1], expected_arg_names)

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    # overwrite from common since `encoder_seq_length` and `encoder_key_length` are calculated
    # in a different way than in text models.
    @tooslow
    def test_saved_model_creation_extended(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.output_hidden_states = True
        config.output_attentions = True

        if hasattr(config, "use_cache"):
            config.use_cache = True

        # in ViT, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
        image_size = to_2tuple(self.model_tester.image_size)
        patch_size = to_2tuple(self.model_tester.patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] //
                                                          patch_size[0])
        seq_len = num_patches + 1
        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length",
                                     seq_len)
        encoder_key_length = getattr(self.model_tester, "key_length",
                                     encoder_seq_length)

        for model_class in self.all_model_classes:
            class_inputs_dict = self._prepare_for_class(
                inputs_dict, model_class)
            model = model_class(config)
            num_out = len(model(class_inputs_dict))

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.save_pretrained(tmpdirname, saved_model=True)
                saved_model_dir = os.path.join(tmpdirname, "saved_model", "1")
                model = tf.keras.models.load_model(saved_model_dir)
                outputs = model(class_inputs_dict)

                if self.is_encoder_decoder:
                    output_hidden_states = outputs["encoder_hidden_states"]
                    output_attentions = outputs["encoder_attentions"]
                else:
                    output_hidden_states = outputs["hidden_states"]
                    output_attentions = outputs["attentions"]

                self.assertEqual(len(outputs), num_out)

                expected_num_layers = getattr(
                    self.model_tester, "expected_num_hidden_layers",
                    self.model_tester.num_hidden_layers + 1)

                self.assertEqual(len(output_hidden_states),
                                 expected_num_layers)
                self.assertListEqual(
                    list(output_hidden_states[0].shape[-2:]),
                    [seq_len, self.model_tester.hidden_size],
                )

                self.assertEqual(len(output_attentions),
                                 self.model_tester.num_hidden_layers)
                self.assertListEqual(
                    list(output_attentions[0].shape[-3:]),
                    [
                        self.model_tester.num_attention_heads,
                        encoder_seq_length, encoder_key_length
                    ],
                )

    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        # in ViT, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
        image_size = to_2tuple(self.model_tester.image_size)
        patch_size = to_2tuple(self.model_tester.patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] //
                                                          patch_size[0])
        seq_len = num_patches + 1
        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length",
                                     seq_len)
        encoder_key_length = getattr(self.model_tester, "key_length",
                                     encoder_seq_length)

        for model_class in self.all_model_classes:
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            config.return_dict = True
            model = model_class(config)
            outputs = model(**self._prepare_for_class(inputs_dict,
                                                      model_class),
                            training=False)
            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
            self.assertEqual(len(attentions),
                             self.model_tester.num_hidden_layers)

            # check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            model = model_class(config)
            outputs = model(**self._prepare_for_class(inputs_dict,
                                                      model_class),
                            training=False)
            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
            self.assertEqual(len(attentions),
                             self.model_tester.num_hidden_layers)

            self.assertListEqual(
                list(attentions[0].shape[-3:]),
                [
                    self.model_tester.num_attention_heads, encoder_seq_length,
                    encoder_key_length
                ],
            )
            out_len = len(outputs)

            # Check attention is always last and order is fine
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = True
            model = model_class(config)
            outputs = model(**self._prepare_for_class(inputs_dict,
                                                      model_class),
                            training=False)

            if hasattr(self.model_tester, "num_hidden_states_types"):
                added_hidden_states = self.model_tester.num_hidden_states_types
            elif self.is_encoder_decoder:
                added_hidden_states = 2
            else:
                added_hidden_states = 1
            self.assertEqual(out_len + added_hidden_states, len(outputs))

            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions

            self.assertEqual(len(self_attentions),
                             self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(self_attentions[0].shape[-3:]),
                [
                    self.model_tester.num_attention_heads, encoder_seq_length,
                    encoder_key_length
                ],
            )

    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)

            outputs = model(
                **self._prepare_for_class(inputs_dict, model_class))

            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states

            expected_num_layers = getattr(
                self.model_tester, "expected_num_hidden_layers",
                self.model_tester.num_hidden_layers + 1)
            self.assertEqual(len(hidden_states), expected_num_layers)

            # ViT has a different seq_length
            image_size = to_2tuple(self.model_tester.image_size)
            patch_size = to_2tuple(self.model_tester.patch_size)
            num_patches = (image_size[1] // patch_size[1]) * (image_size[0] //
                                                              patch_size[0])
            seq_length = num_patches + 1

            self.assertListEqual(
                list(hidden_states[0].shape[-2:]),
                [seq_length, self.model_tester.hidden_size],
            )

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)

    def test_for_image_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_image_classification(
            *config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        model = TFViTModel.from_pretrained("google/vit-base-patch16-224")
        self.assertIsNotNone(model)
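
The to_2tuple helper used throughout the ViT tests above normalizes a scalar size into a (height, width) pair. A minimal sketch of such a helper (an assumption about its behavior, not the library's exact implementation):

def to_2tuple(x):
    # Leave an explicit (h, w) pair untouched; duplicate scalars.
    if isinstance(x, (tuple, list)):
        return tuple(x)
    return (x, x)

# to_2tuple(224) == (224, 224); to_2tuple((224, 196)) == (224, 196)
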
Exemplo n.º 25
0
import tempfile
import unittest

from transformers.file_utils import cached_property, is_datasets_available, is_faiss_available, is_tf_available
from transformers.testing_utils import require_sentencepiece, require_tf, require_tokenizers, slow

if is_tf_available() and is_datasets_available() and is_faiss_available():
    import tensorflow as tf

    from transformers import (
        AutoConfig,
        BartTokenizer,
        DPRQuestionEncoderTokenizer,
        RagConfig,
        RagRetriever,
        RagTokenizer,
        TFBartForConditionalGeneration,
        TFDPRQuestionEncoder,
        TFRagModel,
        TFRagSequenceForGeneration,
        TFRagTokenForGeneration,
    )


def require_retrieval(test_case):
    """
    Decorator marking a test that requires a set of dependencies necessary for pefrorm retrieval with
    :class:`~transformers.RagRetriever`.

    These tests are skipped when respective libraries are not installed.
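
A hedged usage sketch (the test-class name is illustrative): like the other require_* decorators in transformers' test utilities, require_retrieval wraps a test case so it is skipped when the dependencies are missing.

@require_retrieval
class TFRagRetrievalDependentTest(unittest.TestCase):
    def test_with_retrieval(self):
        ...
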
Exemplo n.º 26
0
class TFRagTestMixin:

    all_model_classes = (
        (TFRagModel, TFRagTokenForGeneration, TFRagSequenceForGeneration)
        if is_tf_available() and is_datasets_available() and is_faiss_available()
        else ()
    )
    all_generative_model_classes = (
        (TFRagTokenForGeneration, TFRagSequenceForGeneration)
        if is_tf_available() and is_datasets_available() and is_faiss_available()
        else ()
    )

    retrieval_vector_size = 32
    n_docs = 3
    max_combined_length = 16

    def setUp(self):
        self.tmpdirname = tempfile.mkdtemp()

        # DPR tok
        vocab_tokens = [
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[PAD]",
            "[MASK]",
            "want",
            "##want",
            "##ed",
            "wa",
            "un",
            "runn",
            "##ing",
            ",",
            "low",
            "lowest",
        ]
        dpr_tokenizer_path = os.path.join(self.tmpdirname, "dpr_tokenizer")
        os.makedirs(dpr_tokenizer_path, exist_ok=True)
        self.vocab_file = os.path.join(dpr_tokenizer_path,
                                       DPR_VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

        # BART tok
        vocab = [
            "l",
            "o",
            "w",
            "e",
            "r",
            "s",
            "t",
            "i",
            "d",
            "n",
            "\u0120",
            "\u0120l",
            "\u0120n",
            "\u0120lo",
            "\u0120low",
            "er",
            "\u0120lowest",
            "\u0120newer",
            "\u0120wider",
            "<unk>",
        ]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        merges = [
            "#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""
        ]
        self.special_tokens_map = {"unk_token": "<unk>"}

        bart_tokenizer_path = os.path.join(self.tmpdirname, "bart_tokenizer")
        os.makedirs(bart_tokenizer_path, exist_ok=True)
        self.vocab_file = os.path.join(bart_tokenizer_path,
                                       BART_VOCAB_FILES_NAMES["vocab_file"])
        self.merges_file = os.path.join(bart_tokenizer_path,
                                        BART_VOCAB_FILES_NAMES["merges_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as fp:
            fp.write(json.dumps(vocab_tokens) + "\n")
        with open(self.merges_file, "w", encoding="utf-8") as fp:
            fp.write("\n".join(merges))

    @cached_property
    def dpr_tokenizer(self) -> DPRQuestionEncoderTokenizer:
        return DPRQuestionEncoderTokenizer.from_pretrained(
            os.path.join(self.tmpdirname, "dpr_tokenizer"))

    @cached_property
    def bart_tokenizer(self) -> BartTokenizer:
        return BartTokenizer.from_pretrained(
            os.path.join(self.tmpdirname, "bart_tokenizer"))

    def tearDown(self):
        shutil.rmtree(self.tmpdirname)

    def get_retriever(self, config):
        dataset = Dataset.from_dict({
            "id": ["0", "1", "3"],
            "text": ["foo", "bar", "qux"],
            "title": ["Foo", "Bar", "Qux"],
            "embeddings": [
                np.ones(self.retrieval_vector_size),
                2 * np.ones(self.retrieval_vector_size),
                3 * np.ones(self.retrieval_vector_size),
            ],
        })
        dataset.add_faiss_index("embeddings",
                                string_factory="Flat",
                                metric_type=faiss.METRIC_INNER_PRODUCT)
        tokenizer = self.bart_tokenizer
        with patch("transformers.models.rag.retrieval_rag.load_dataset") as mock_load_dataset:
            mock_load_dataset.return_value = dataset
            retriever = RagRetriever(
                config,
                question_encoder_tokenizer=self.dpr_tokenizer,
                generator_tokenizer=tokenizer,
            )
        return retriever

    def check_model_with_retriever(self, config, input_ids, attention_mask,
                                   decoder_input_ids, decoder_attention_mask,
                                   **kwargs):
        self.assertIsNotNone(config.question_encoder)
        self.assertIsNotNone(config.generator)

        for model_class in self.all_model_classes:
            model = model_class(config, retriever=self.get_retriever(config))

            self.assertTrue(model.config.is_encoder_decoder)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
            )

            # logits
            self.assertEqual(
                outputs.logits.shape,
                (self.n_docs * decoder_input_ids.shape[0],
                 decoder_input_ids.shape[1], config.generator.vocab_size),
            )
            # generator encoder last hidden states
            self.assertEqual(
                outputs.generator_enc_last_hidden_state.shape,
                (self.n_docs * decoder_input_ids.shape[0],
                 self.max_combined_length, config.generator.hidden_size),
            )
            # doc scores
            self.assertEqual(outputs.doc_scores.shape,
                             (input_ids.shape[0], self.n_docs))

    def check_model_generate_from_context_input_ids(self, config, input_ids,
                                                    attention_mask,
                                                    decoder_input_ids,
                                                    decoder_attention_mask,
                                                    **kwargs):
        self.assertIsNotNone(config.question_encoder)
        self.assertIsNotNone(config.generator)

        retriever = self.get_retriever(config)

        for i, model_class in enumerate(self.all_generative_model_classes):
            model = model_class(config)
            self.assertTrue(model.config.is_encoder_decoder)

            question_hidden_states = model.question_encoder(
                input_ids, attention_mask=attention_mask)[0]

            out = retriever(
                input_ids,
                question_hidden_states.numpy(),
                prefix=config.generator.prefix,
                return_tensors="tf",
            )

            context_input_ids, context_attention_mask, retrieved_doc_embeds = (
                out["context_input_ids"],
                out["context_attention_mask"],
                out["retrieved_doc_embeds"],
            )
            retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32)

            # compute doc_scores
            doc_scores = tf.squeeze(
                tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]),
                          retrieved_doc_embeds,
                          transpose_b=True),
                axis=[1],
            )
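            # shape walk-through: question_hidden_states is (batch, hidden);
            # expand_dims -> (batch, 1, hidden); matmul with the transposed
            # retrieved_doc_embeds (batch, n_docs, hidden) -> (batch, 1, n_docs);
            # squeeze drops the middle axis, giving doc_scores of (batch, n_docs)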

            outputs = model.generate(
                context_input_ids=context_input_ids,
                context_attention_mask=context_attention_mask,
                doc_scores=doc_scores,
            )

            self.assertIsNotNone(outputs)

    def check_model_generate(self, config, input_ids, attention_mask,
                             decoder_input_ids, decoder_attention_mask,
                             **kwargs):
        self.assertIsNotNone(config.question_encoder)
        self.assertIsNotNone(config.generator)

        for model_class in self.all_generative_model_classes:
            model = model_class(config, retriever=self.get_retriever(config))

            self.assertTrue(model.config.is_encoder_decoder)

            input_ids = tf.cast(input_ids, tf.int32)
            outputs = model.generate(
                input_ids=input_ids,
                num_beams=2,
                num_return_sequences=2,
                decoder_start_token_id=config.generator.eos_token_id,
            )

            self.assertIsNotNone(outputs)

    def check_model_without_retriever(self, config, input_ids, attention_mask,
                                      decoder_input_ids,
                                      decoder_attention_mask, **kwargs):
        self.assertIsNotNone(config.question_encoder)
        self.assertIsNotNone(config.generator)

        retriever = self.get_retriever(config)

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertTrue(model.config.is_encoder_decoder)

            question_hidden_states = model.question_encoder(
                input_ids, attention_mask=attention_mask)[0]

            out = retriever(
                input_ids,
                question_hidden_states.numpy(),
                prefix=config.generator.prefix,
                return_tensors="tf",
            )

            context_input_ids, context_attention_mask, retrieved_doc_embeds = (
                out["context_input_ids"],
                out["context_attention_mask"],
                out["retrieved_doc_embeds"],
            )

            retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32)

            # compute doc_scores
            doc_scores = tf.squeeze(
                tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]),
                          retrieved_doc_embeds,
                          transpose_b=True),
                axis=[1],
            )

            outputs = model(
                input_ids=None,
                context_input_ids=context_input_ids,
                context_attention_mask=context_attention_mask,
                doc_scores=doc_scores,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
            )

            # logits
            self.assertEqual(
                outputs.logits.shape,
                (self.n_docs * decoder_input_ids.shape[0],
                 decoder_input_ids.shape[1], config.generator.vocab_size),
            )

            # generator encoder last hidden states
            self.assertEqual(
                outputs.generator_enc_last_hidden_state.shape,
                (self.n_docs * decoder_input_ids.shape[0],
                 self.max_combined_length, config.generator.hidden_size),
            )
            # doc scores
            self.assertEqual(outputs.doc_scores.shape,
                             (input_ids.shape[0], self.n_docs))

    def check_model_custom_n_docs(self, config, input_ids, attention_mask,
                                  decoder_input_ids, decoder_attention_mask,
                                  n_docs, **kwargs):
        self.assertIsNotNone(config.question_encoder)
        self.assertIsNotNone(config.generator)

        retriever = self.get_retriever(config)

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertTrue(model.config.is_encoder_decoder)

            question_hidden_states = model.question_encoder(
                input_ids, attention_mask=attention_mask)[0]

            out = retriever(
                input_ids,
                question_hidden_states.numpy(),
                prefix=config.generator.prefix,
                return_tensors="tf",
                n_docs=n_docs,
            )

            context_input_ids, context_attention_mask, retrieved_doc_embeds = (
                out["context_input_ids"],
                out["context_attention_mask"],
                out["retrieved_doc_embeds"],
            )

            retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32)

            # compute doc_scores
            doc_scores = tf.squeeze(
                tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]),
                          retrieved_doc_embeds,
                          transpose_b=True),
                axis=[1],
            )

            outputs = model(
                input_ids=None,
                context_input_ids=context_input_ids,
                context_attention_mask=context_attention_mask,
                doc_scores=doc_scores,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                n_docs=n_docs,
            )

            # logits
            self.assertEqual(
                outputs.logits.shape,
                (n_docs * decoder_input_ids.shape[0],
                 decoder_input_ids.shape[1], config.generator.vocab_size),
            )
            # generator encoder last hidden states
            self.assertEqual(
                outputs.generator_enc_last_hidden_state.shape,
                (n_docs * decoder_input_ids.shape[0], self.max_combined_length,
                 config.generator.hidden_size),
            )
            # doc scores
            self.assertEqual(outputs.doc_scores.shape,
                             (input_ids.shape[0], n_docs))

    def check_model_with_mismatch_n_docs_value(self, config, input_ids,
                                               attention_mask,
                                               decoder_input_ids,
                                               decoder_attention_mask,
                                               retriever_n_docs,
                                               generator_n_docs, **kwargs):
        self.assertIsNotNone(config.question_encoder)
        self.assertIsNotNone(config.generator)

        retriever = self.get_retriever(config)

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertTrue(model.config.is_encoder_decoder)

            question_hidden_states = model.question_encoder(
                input_ids, attention_mask=attention_mask)[0]

            out = retriever(
                input_ids,
                question_hidden_states.numpy(),
                prefix=config.generator.prefix,
                return_tensors="tf",
                n_docs=retriever_n_docs,
            )

            context_input_ids, context_attention_mask, retrieved_doc_embeds = (
                out["context_input_ids"],
                out["context_attention_mask"],
                out["retrieved_doc_embeds"],
            )

            retrieved_doc_embeds = tf.cast(retrieved_doc_embeds, tf.float32)

            # compute doc_scores
            doc_scores = tf.squeeze(
                tf.matmul(tf.expand_dims(question_hidden_states, axis=[1]),
                          retrieved_doc_embeds,
                          transpose_b=True),
                axis=[1],
            )

            # doc_scores was computed with retriever_n_docs documents, so
            # passing a different generator-side n_docs must fail the model's
            # shape consistency check
            self.assertRaises(
                AssertionError,
                model.__call__,
                input_ids=None,
                context_input_ids=context_input_ids,
                context_attention_mask=context_attention_mask,
                doc_scores=doc_scores,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                n_docs=generator_n_docs,
            )

    def check_model_with_encoder_outputs(self, config, input_ids,
                                         attention_mask, decoder_input_ids,
                                         decoder_attention_mask, **kwargs):
        self.assertIsNotNone(config.question_encoder)
        self.assertIsNotNone(config.generator)

        for model_class in self.all_model_classes:
            model = model_class(config, retriever=self.get_retriever(config))

            self.assertTrue(model.config.is_encoder_decoder)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
            )

            encoder_outputs = TFBaseModelOutput(
                outputs.generator_enc_last_hidden_state)

            # run only generator
            outputs = model(
                input_ids=None,
                encoder_outputs=encoder_outputs,
                doc_scores=outputs.doc_scores,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
            )

            # logits
            self.assertEqual(
                outputs.logits.shape,
                (self.n_docs * decoder_input_ids.shape[0],
                 decoder_input_ids.shape[1], config.generator.vocab_size),
            )
            # generator encoder last hidden states
            self.assertEqual(
                outputs.generator_enc_last_hidden_state.shape,
                (self.n_docs * decoder_input_ids.shape[0],
                 self.max_combined_length, config.generator.hidden_size),
            )
            # doc scores
            self.assertEqual(outputs.doc_scores.shape,
                             (input_ids.shape[0], self.n_docs))

    def test_model_with_retriever(self):
        inputs_dict = self.config_and_inputs
        self.check_model_with_retriever(**inputs_dict)

    def test_model_without_retriever(self):
        inputs_dict = self.config_and_inputs
        self.check_model_without_retriever(**inputs_dict)

    def test_model_generate_from_context_input_ids(self):
        inputs_dict = self.config_and_inputs
        self.check_model_generate_from_context_input_ids(**inputs_dict)

    def test_model_with_encoder_outputs(self):
        inputs_dict = self.config_and_inputs
        self.check_model_with_encoder_outputs(**inputs_dict)

    def test_model_generate(self):
        inputs_dict = self.config_and_inputs
        self.check_model_generate(**inputs_dict)

    def test_model_with_custom_n_docs(self):
        inputs_dict = self.config_and_inputs
        inputs_dict["n_docs"] = 1
        self.check_model_custom_n_docs(**inputs_dict)

    def test_model_with_mismatch_n_docs_value(self):
        inputs_dict = self.config_and_inputs
        inputs_dict["retriever_n_docs"] = 3
        inputs_dict["generator_n_docs"] = 2
        self.check_model_with_mismatch_n_docs_value(**inputs_dict)
Exemplo n.º 27
0
""" Testing suite for the TensorFlow ViT model. """

import inspect
import os
import tempfile
import unittest

from transformers import ViTConfig
from transformers.file_utils import cached_property, is_tf_available, is_vision_available
from transformers.testing_utils import require_tf, require_vision, slow, tooslow

from .test_configuration_common import ConfigTester
from .test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor

if is_tf_available():
    import tensorflow as tf

    from transformers import TFViTForImageClassification, TFViTModel
    from transformers.models.vit.modeling_tf_vit import to_2tuple

if is_vision_available():
    from PIL import Image

    from transformers import ViTFeatureExtractor


class TFViTModelTester:
    def __init__(
        self,
        parent,
Exemplo n.º 28
0
# Imports reconstructed so this snippet is self-contained; they follow the
# layout of the original transformers test module (a sketch, not verbatim).
import collections.abc
import inspect
import unittest

import numpy as np

from transformers import Data2VecVisionConfig
from transformers.file_utils import is_tf_available
from transformers.testing_utils import slow

from .test_configuration_common import ConfigTester
from .test_modeling_tf_common import TFModelTesterMixin

if is_tf_available():
    import tensorflow as tf

    from transformers import (
        TFData2VecVisionForImageClassification,
        TFData2VecVisionForSemanticSegmentation,
        TFData2VecVisionModel,
    )
    from transformers.models.data2vec.modeling_tf_data2vec_vision import (
        TF_DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST,
    )


# Note: TFData2VecVisionModelTester (used in setUp below) is defined alongside
# this class in the original test module and is not reproduced here.
class TFData2VecVisionModelTest(TFModelTesterMixin, unittest.TestCase):
    """
    Here we also overwrite some of the tests of test_modeling_tf_common.py, as Data2VecVision does not use
    input_ids, inputs_embeds, attention_mask and seq_length.
    """

    all_model_classes = ((TFData2VecVisionModel,
                          TFData2VecVisionForImageClassification,
                          TFData2VecVisionForSemanticSegmentation)
                         if is_tf_available() else ())

    test_pruning = False
    test_onnx = False
    test_resize_embeddings = False
    test_head_masking = False

    def setUp(self):
        self.model_tester = TFData2VecVisionModelTester(self)
        self.config_tester = ConfigTester(self,
                                          config_class=Data2VecVisionConfig,
                                          has_text_modality=False,
                                          hidden_size=37)

    def test_config(self):
        self.config_tester.run_common_tests()

    @unittest.skip(reason="Data2VecVision does not use inputs_embeds")
    def test_inputs_embeds(self):
        pass

    def test_model_common_attributes(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(),
                                  tf.keras.layers.Layer)
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, tf.keras.layers.Layer))

    def test_forward_signature(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.call)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = [*signature.parameters.keys()]

            expected_arg_names = ["pixel_values"]
            self.assertListEqual(arg_names[:1], expected_arg_names)

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_for_image_segmentation(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_image_segmentation(
            *config_and_inputs)

    @unittest.skip("Test was written for TF 1.x and isn't really relevant here"
                   )
    def test_compile_tf_model(self):
        pass

    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
        )
        config.return_dict = True

        # in Data2VecVision, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
        image_size = (self.model_tester.image_size if isinstance(
            self.model_tester.image_size, collections.abc.Iterable) else
                      (self.model_tester.image_size,
                       self.model_tester.image_size))
        patch_size = (self.model_tester.patch_size if isinstance(
            self.model_tester.patch_size, collections.abc.Iterable) else
                      (self.model_tester.patch_size,
                       self.model_tester.patch_size))
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] //
                                                          patch_size[0])
        seq_len = num_patches + 1
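        # illustrative numbers (not taken from this test): image_size=224 and
        # patch_size=16 give num_patches = (224 // 16) ** 2 = 196, seq_len = 197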
        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length",
                                     seq_len)
        encoder_key_length = getattr(self.model_tester, "key_length",
                                     encoder_seq_length)
        chunk_length = getattr(self.model_tester, "chunk_length", None)
        if chunk_length is not None and hasattr(self.model_tester,
                                                "num_hashes"):
            encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes

        for model_class in self.all_model_classes:
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            config.return_dict = True
            model = model_class(config)
            outputs = model(**self._prepare_for_class(inputs_dict,
                                                      model_class),
                            training=False)
            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
            self.assertEqual(len(attentions),
                             self.model_tester.num_hidden_layers)

            # check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            model = model_class(config)
            outputs = model(**self._prepare_for_class(inputs_dict,
                                                      model_class),
                            training=False)
            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
            self.assertEqual(len(attentions),
                             self.model_tester.num_hidden_layers)

            self.assertListEqual(
                list(attentions[0].shape[-3:]),
                [
                    self.model_tester.num_attention_heads, encoder_seq_length,
                    encoder_key_length
                ],
            )
            out_len = len(outputs)

            # Check attention is always last and order is fine
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = True
            model = model_class(config)
            outputs = model(**self._prepare_for_class(inputs_dict,
                                                      model_class),
                            training=False)

            # output_hidden_states adds exactly one entry (hidden_states) to
            # the outputs, so the expected length grows by one
            self.assertEqual(out_len + 1, len(outputs))

            self_attentions = outputs.attentions

            self.assertEqual(len(self_attentions),
                             self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(self_attentions[0].shape[-3:]),
                [
                    self.model_tester.num_attention_heads, encoder_seq_length,
                    encoder_key_length
                ],
            )

    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)

            outputs = model(
                **self._prepare_for_class(inputs_dict, model_class))

            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states

            expected_num_layers = getattr(
                self.model_tester, "expected_num_hidden_layers",
                self.model_tester.num_hidden_layers + 1)
            self.assertEqual(len(hidden_states), expected_num_layers)

            # Data2VecVision has a different seq_length
            image_size = (self.model_tester.image_size if isinstance(
                self.model_tester.image_size, collections.abc.Iterable) else
                          (self.model_tester.image_size,
                           self.model_tester.image_size))
            patch_size = (self.model_tester.patch_size if isinstance(
                self.model_tester.patch_size, collections.abc.Iterable) else
                          (self.model_tester.patch_size,
                           self.model_tester.patch_size))
            num_patches = (image_size[1] // patch_size[1]) * (image_size[0] //
                                                              patch_size[0])
            seq_length = num_patches + 1

            self.assertListEqual(
                list(hidden_states[0].shape[-2:]),
                [seq_length, self.model_tester.hidden_size],
            )

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
        )

        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)

    # Overriding this method since the base method won't be compatible with Data2VecVision.
    def test_keras_fit(self):
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
        for model_class in self.all_model_classes:
            # `TFData2VecVisionModel` cannot operate with the default `fit()`
            # method, so it is skipped here.
            if model_class.__name__ != "TFData2VecVisionModel":
                model = model_class(config)
                if getattr(model, "hf_compute_loss", None):
                    # Test that model correctly compute the loss with kwargs
                    _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit(
                    )

                    label_names = {"labels"}
                    self.assertGreater(len(label_names),
                                       0,
                                       msg="No matching label names found!")
                    labels = {
                        key: val
                        for key, val in prepared_for_class.items()
                        if key in label_names
                    }
                    inputs_minus_labels = {
                        key: val
                        for key, val in prepared_for_class.items()
                        if key not in label_names
                    }
                    self.assertGreater(len(inputs_minus_labels), 0)
                    model.compile(optimizer=tf.keras.optimizers.SGD(0.0),
                                  run_eagerly=True)
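                    # a zero learning rate keeps the weights fixed, so the two
                    # fit() calls below should report (nearly) identical
                    # validation losses however the labels are passed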

                    # Make sure the model fits without crashing regardless of where we pass the labels
                    history1 = model.fit(
                        prepared_for_class,
                        validation_data=prepared_for_class,
                        steps_per_epoch=1,
                        validation_steps=1,
                        shuffle=False,
                    )
                    val_loss1 = history1.history["val_loss"][0]
                    history2 = model.fit(
                        inputs_minus_labels,
                        labels,
                        validation_data=(inputs_minus_labels, labels),
                        steps_per_epoch=1,
                        validation_steps=1,
                        shuffle=False,
                    )
                    val_loss2 = history2.history["val_loss"][0]
                    self.assertTrue(
                        np.allclose(val_loss1, val_loss2, atol=1e-2,
                                    rtol=1e-3))

    def check_pt_tf_outputs(self,
                            tf_outputs,
                            pt_outputs,
                            model_class,
                            tol=2e-4,
                            name="outputs",
                            attributes=None):
        # We override with a slightly higher tol value, as semseg models tend to diverge a bit more
        super().check_pt_tf_outputs(tf_outputs, pt_outputs, model_class, tol,
                                    name, attributes)

    # Overriding this method since the base method won't be compatible with Data2VecVision.
    def test_loss_computation(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common(
        )
        for model_class in self.all_model_classes:
            # `TFData2VecVisionModel` has no labels against which a loss could
            # be computed, so it is skipped here.
            if model_class.__name__ != "TFData2VecVisionModel":
                model = model_class(config)
                if getattr(model, "hf_compute_loss", None):
                    # The number of elements in the loss should be the same as the number of elements in the label
                    _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit(
                    )
                    added_label = prepared_for_class[sorted(
                        list(prepared_for_class.keys() - inputs_dict.keys()),
                        reverse=True)[0]]
                    loss_size = tf.size(added_label)

                    # Test that model correctly compute the loss with kwargs
                    possible_input_names = {
                        "input_ids", "pixel_values", "input_features"
                    }
                    input_name = possible_input_names.intersection(
                        set(prepared_for_class)).pop()
                    model_input = prepared_for_class.pop(input_name)

                    loss = model(model_input, **prepared_for_class)[0]
                    self.assertEqual(loss.shape, [loss_size])

                    # Test that model correctly compute the loss with a dict
                    _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit(
                    )
                    loss = model(**prepared_for_class)[0]
                    self.assertEqual(loss.shape, [loss_size])

                    # Test that model correctly compute the loss with a tuple
                    label_keys = prepared_for_class.keys() - inputs_dict.keys()
                    signature = inspect.signature(model.call).parameters
                    signature_names = list(signature.keys())

                    # Create a dictionary holding the location of the tensors in the tuple
                    tuple_index_mapping = {0: input_name}
                    for label_key in label_keys:
                        label_key_index = signature_names.index(label_key)
                        tuple_index_mapping[label_key_index] = label_key
                    sorted_tuple_index_mapping = sorted(
                        tuple_index_mapping.items())
                    # Initialize a list with their default values, update the values and convert to a tuple
                    list_input = []

                    for name in signature_names:
                        if name != "kwargs":
                            list_input.append(signature[name].default)

                    for index, value in sorted_tuple_index_mapping:
                        list_input[index] = prepared_for_class[value]

                    tuple_input = tuple(list_input)

                    # Send to model
                    loss = model(tuple_input[:-1])[0]

                    self.assertEqual(loss.shape, [loss_size])

    def test_for_image_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_image_classification(
            *config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        for model_name in TF_DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = TFData2VecVisionModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
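
# Hypothetical invocation sketch (not part of the original snippet): the class
# above is a plain unittest.TestCase, so, assuming TFData2VecVisionModelTester
# lives in the same module, it can be run with e.g.
#
#     python -m unittest this_module -k TFData2VecVisionModelTest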
Exemplo n.º 29
0
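    # Note: this snippet is a method of a processor class; it assumes the
    # enclosing module provides `logger`, `InputFeatures`, `is_tf_available`
    # and `is_torch_available` (as in transformers.data.processors.utils).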
    def get_features(
        self,
        tokenizer,
        max_length=None,
        pad_on_left=False,
        pad_token=0,
        mask_padding_with_zero=True,
        return_tensors=None,
    ):
        """
        Convert the examples into a list of ``InputFeatures``

        Args:
            tokenizer: Instance of a tokenizer that will tokenize the examples
            max_length: Maximum example length. Defaults to the tokenizer's ``max_len``
            pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
            pad_token: Padding token id
            mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
                and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
                actual values)
            return_tensors: ``"tf"``, ``"pt"`` or ``None``, selecting the type of dataset to return

        Returns:
            If ``return_tensors`` is ``None``, a list of ``InputFeatures`` which can be fed to the model. If it is
            ``"tf"`` or ``"pt"``, the corresponding ``tf.data.Dataset`` or PyTorch ``TensorDataset``.
        """
        if max_length is None:
            max_length = tokenizer.max_len

        label_map = {label: i for i, label in enumerate(self.labels)}

        all_input_ids = []
        for (ex_index, example) in enumerate(self.examples):
            if ex_index % 10000 == 0:
                logger.info("Tokenizing example %d", ex_index)

            input_ids = tokenizer.encode(
                example.text_a,
                add_special_tokens=True,
                max_length=min(max_length, tokenizer.max_len),
            )
            all_input_ids.append(input_ids)

        batch_length = max(len(input_ids) for input_ids in all_input_ids)
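        # examples will be padded to the longest sequence in this batch rather
        # than to a fixed max_length, keeping padding to the minimum needed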

        features = []
        for (ex_index,
             (input_ids,
              example)) in enumerate(zip(all_input_ids, self.examples)):
            if ex_index % 10000 == 0:
                logger.info("Writing example %d/%d" %
                            (ex_index, len(self.examples)))
            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            attention_mask = [1 if mask_padding_with_zero else 0
                              ] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = batch_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                attention_mask = ([0 if mask_padding_with_zero else 1] *
                                  padding_length) + attention_mask
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                attention_mask = attention_mask + (
                    [0 if mask_padding_with_zero else 1] * padding_length)

            assert len(
                input_ids
            ) == batch_length, "Error with input length {} vs {}".format(
                len(input_ids), batch_length)
            assert len(
                attention_mask
            ) == batch_length, "Error with input length {} vs {}".format(
                len(attention_mask), batch_length)

            if self.mode == "classification":
                label = label_map[example.label]
            elif self.mode == "regression":
                label = float(example.label)
            else:
                raise ValueError(self.mode)

            if ex_index < 5 and self.verbose:
                logger.info("*** Example ***")
                logger.info("guid: %s" % (example.guid))
                logger.info("input_ids: %s" %
                            " ".join([str(x) for x in input_ids]))
                logger.info("attention_mask: %s" %
                            " ".join([str(x) for x in attention_mask]))
                logger.info("label: %s (id = %d)" % (example.label, label))

            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=attention_mask,
                              label=label))

        if return_tensors is None:
            return features
        elif return_tensors == "tf":
            if not is_tf_available():
                raise RuntimeError(
                    "return_tensors set to 'tf' but TensorFlow 2.0 can't be imported"
                )
            import tensorflow as tf

            def gen():
                for ex in features:
                    yield ({
                        "input_ids": ex.input_ids,
                        "attention_mask": ex.attention_mask
                    }, ex.label)

            dataset = tf.data.Dataset.from_generator(
                gen,
                ({
                    "input_ids": tf.int32,
                    "attention_mask": tf.int32
                }, tf.int64),
                ({
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None])
                }, tf.TensorShape([])),
            )
            return dataset
        elif return_tensors == "pt":
            if not is_torch_available():
                raise RuntimeError(
                    "return_tensors set to 'pt' but PyTorch can't be imported")
            import torch
            from torch.utils.data import TensorDataset

            all_input_ids = torch.tensor([f.input_ids for f in features],
                                         dtype=torch.long)
            all_attention_mask = torch.tensor(
                [f.attention_mask for f in features], dtype=torch.long)
            if self.mode == "classification":
                all_labels = torch.tensor([f.label for f in features],
                                          dtype=torch.long)
            elif self.mode == "regression":
                all_labels = torch.tensor([f.label for f in features],
                                          dtype=torch.float)

            dataset = TensorDataset(all_input_ids, all_attention_mask,
                                    all_labels)
            return dataset
        else:
            raise ValueError("return_tensors should be one of 'tf' or 'pt'")
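
    # --- Hypothetical usage sketch (not part of the original snippet) ---
    # Assumes a processor object exposing the get_features() above, in the
    # spirit of transformers.data.processors.utils; the names below are made
    # up for illustration:
    #
    #     from transformers import BertTokenizer
    #
    #     tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    #     processor = MyClassificationProcessor()              # hypothetical
    #     feats = processor.get_features(tokenizer, max_length=128)
    #     tf_dataset = processor.get_features(tokenizer, return_tensors="tf")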
Exemplo n.º 30
0
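    # Note: like the previous example, this is a method of a processor class;
    # it assumes `logger`, `InputFeatures`, `is_tf_available` and
    # `is_torch_available` are available at module level.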
    def get_features(
        self,
        max_seq_length,
        tokenizer,
        return_tensors,
        cls_token_at_end=False,
        cls_token="[CLS]",
        cls_token_segment_id=0,
        sep_token="[SEP]",
        sep_token_extra=False,
        pad_on_left=False,
        pad_token=0,
        pad_token_segment_id=0,
        pad_token_label_id=-100,
        sequence_a_segment_id=0,
        mask_padding_with_zero=True,
    ):
        """ Loads a data file into a list of `InputBatch`s
            `cls_token_at_end` define the location of the CLS token:
                - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
            `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
        """
        label_map = {label: i for i, label in enumerate(self.labels)}

        features = []  # [[input_ids, input_mask, segment_ids, label_id]]
        for (ex_index, example) in enumerate(self.examples):
            if ex_index % 10000 == 0:
                logger.info("Writing example %d of %d", ex_index,
                            len(self.examples))

            tokens = []
            label_ids = []
            for word, label in zip(example.text_a, example.label):
                word_tokens = tokenizer.tokenize(word)
                tokens.extend(word_tokens)
                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
                label_ids.extend(
                    [label_map[label]] + [pad_token_label_id] *
                    (len(word_tokens) - 1)
                )  # in some languages (e.g. German) one word may be tokenized into several subwords

            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
            special_tokens_count = 3 if sep_token_extra else 2
            if len(tokens) > max_seq_length - special_tokens_count:
                tokens = tokens[:(max_seq_length - special_tokens_count)]
                label_ids = label_ids[:(max_seq_length - special_tokens_count)]

            tokens += [sep_token]
            label_ids += [pad_token_label_id]
            if sep_token_extra:
                # RoBERTa uses an extra separator between pairs of sentences
                tokens += [sep_token]
                label_ids += [pad_token_label_id]
            segment_ids = [sequence_a_segment_id] * len(tokens)

            if cls_token_at_end:
                tokens += [cls_token]
                label_ids += [pad_token_label_id]
                segment_ids += [cls_token_segment_id]
            else:
                tokens = [cls_token] + tokens
                label_ids = [pad_token_label_id] + label_ids
                segment_ids = [cls_token_segment_id] + segment_ids

            input_ids = tokenizer.convert_tokens_to_ids(
                tokens)  # takes a list of tokens and returns a list of ids

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_seq_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                input_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + input_mask
                segment_ids = ([pad_token_segment_id] *
                               padding_length) + segment_ids
                label_ids = ([pad_token_label_id] * padding_length) + label_ids
            else:
                input_ids += [pad_token] * padding_length
                input_mask += [0 if mask_padding_with_zero else 1
                               ] * padding_length
                segment_ids += [pad_token_segment_id] * padding_length
                label_ids += [pad_token_label_id] * padding_length
            # make sure input_ids, input_mask, segment_ids and label_ids all have the same length
            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            assert len(label_ids) == max_seq_length

            if ex_index < 5:
                logger.info("*** Example ***")
                logger.info("guid: %s", example.guid)
                logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
                logger.info("input_ids: %s",
                            " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s",
                            " ".join([str(x) for x in input_mask]))
                logger.info("segment_ids: %s",
                            " ".join([str(x) for x in segment_ids]))
                logger.info("label_ids: %s",
                            " ".join([str(x) for x in label_ids]))

            features.append(
                InputFeatures(input_ids=input_ids,
                              attention_mask=input_mask,
                              token_type_ids=segment_ids,
                              label=label_ids))

        # convert the features to the requested dataset type
        if return_tensors is None:
            return features
        elif return_tensors == "tf":
            if not is_tf_available():
                raise RuntimeError(
                    "return_tensors set to 'tf' but TensorFlow 2.0 can't be imported"
                )
            import tensorflow as tf

            def gen():
                for ex in features:
                    yield (
                        {
                            "input_ids": ex.input_ids,
                            "attention_mask": ex.attention_mask,
                            "token_type_ids": ex.token_type_ids,
                        },
                        ex.label,
                    )

            # labels are per-token id lists here, so the label component has
            # shape [None] rather than a scalar; token_type_ids are included
            # to mirror the PyTorch branch below
            dataset = tf.data.Dataset.from_generator(
                gen,
                ({
                    "input_ids": tf.int32,
                    "attention_mask": tf.int32,
                    "token_type_ids": tf.int32
                }, tf.int64),
                ({
                    "input_ids": tf.TensorShape([None]),
                    "attention_mask": tf.TensorShape([None]),
                    "token_type_ids": tf.TensorShape([None])
                }, tf.TensorShape([None])),
            )
            return dataset
        elif return_tensors == "pt":
            if not is_torch_available():
                raise RuntimeError(
                    "return_tensors set to 'pt' but PyTorch can't be imported")
            import torch
            from torch.utils.data import TensorDataset

            all_input_ids = torch.tensor([f.input_ids for f in features],
                                         dtype=torch.long)
            all_attention_mask = torch.tensor(
                [f.attention_mask for f in features], dtype=torch.long)
            all_token_type_ids = torch.tensor(
                [f.token_type_ids for f in features], dtype=torch.long)
            if self.mode == "classification":
                all_labels = torch.tensor([f.label for f in features],
                                          dtype=torch.long)
            elif self.mode == "regression":
                all_labels = torch.tensor([f.label for f in features],
                                          dtype=torch.float)

            dataset = TensorDataset(all_input_ids, all_attention_mask,
                                    all_token_type_ids, all_labels)
            return dataset
        else:
            raise ValueError("return_tensors should be one of 'tf' or 'pt'")
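
    # --- Hypothetical usage sketch (not part of the original snippet) ---
    # Token-classification variant; the processor name is made up:
    #
    #     from transformers import BertTokenizer
    #
    #     tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    #     ner_processor = MyNerProcessor()                     # hypothetical
    #     dataset = ner_processor.get_features(
    #         max_seq_length=128,
    #         tokenizer=tokenizer,
    #         return_tensors="pt",
    #     )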