    def test_loading_with_sampler(self):
        reader = MultiTaskDatasetReader(readers={
            "a": FakeDatasetReaderA(),
            "b": FakeDatasetReaderB()
        })
        data_path = {"a": "ignored", "b": "ignored"}
        scheduler = RoundRobinScheduler(batch_size=4)
        sampler = WeightedSampler({"a": 1, "b": 2})
        loader = MultiTaskDataLoader(
            reader=reader,
            data_path=data_path,
            scheduler=scheduler,
            sampler=sampler,
            instances_per_epoch=9,
        )
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(["A", "B"], "labels")
        loader.index_with(vocab)
        iterator = iter(loader)
        batch = next(iterator)
        assert torch.all(batch["label"] == torch.IntTensor([0, 1, 0, 1]))
        batch = next(iterator)
        assert torch.all(batch["label"] == torch.IntTensor([0, 1, 1, 1]))
        batch = next(iterator)
        assert torch.all(batch["label"] == torch.IntTensor([1]))
        with pytest.raises(StopIteration):
            next(iterator)
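The expected label ids in these assertions follow directly from how Vocabulary indexes the "labels" namespace: it matches the default non-padded namespace patterns ("*labels", "*tags"), so add_tokens_to_namespace assigns ids starting at 0, with no reserved padding or OOV entries. A minimal check, assuming only AllenNLP's defaults:

from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_tokens_to_namespace(["A", "B"], "labels")

# Non-padded namespace: no @@PADDING@@/@@UNKNOWN@@ entries, ids start at 0.
assert vocab.get_token_index("A", namespace="labels") == 0
assert vocab.get_token_index("B", namespace="labels") == 1
assert vocab.get_vocab_size("labels") == 2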
Example #2
    def labeled_json_to_labeled_instances(
            self, json_dict: JsonDict) -> Dict[int, Instance]:
        seq_offset = 0
        seq_len = -1
        adhoc_vocab = Vocabulary()
        instances = {}
        for i, str_i in sorted(map((lambda x: (int(x), x)), json_dict.keys())):
            inst_obj = json_dict[str_i]
            if seq_len == -1:
                seq_len = len(inst_obj['words'])
                text_field = TextField(
                    [Token(tok['text']) for tok in inst_obj['words']], {})
                instance = Instance({'tokens': text_field})

            new_instance = instance.duplicate()

            tags_field = ConstructiveSupertagField(
                [json_to_cat(tag) for tag in inst_obj['tags']], text_field,
                [i - seq_offset])
            adhoc_vocab.add_tokens_to_namespace(tags_field.labels, 'labels')
            new_instance.add_field('tags', tags_field)
            new_instance.index_fields(adhoc_vocab)

            instances[i] = new_instance

            if i + 1 - seq_offset == seq_len:
                seq_offset += seq_len
                seq_len = -1

        return instances
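The seq_offset/seq_len bookkeeping above groups consecutive integer keys into sentences: seq_len == -1 marks the start of a new sentence, and i - seq_offset is the position of key i within its sentence. A minimal sketch of that logic with plain integers (the input lengths are hypothetical):

# Hypothetical input: keys 0..2 form a 3-token sentence, keys 3..4 a 2-token one.
sentence_length_at_start = {0: 3, 3: 2}

seq_offset, seq_len = 0, -1
positions = []
for i in range(5):
    if seq_len == -1:  # first key of a new sentence
        seq_len = sentence_length_at_start[i]
    positions.append(i - seq_offset)  # position of key i within its sentence
    if i + 1 - seq_offset == seq_len:  # sentence exhausted; reset for the next
        seq_offset += seq_len
        seq_len = -1

assert positions == [0, 1, 2, 0, 1]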
Example #3
def extend_labels(vocab: Vocabulary, labels: List[str]):
    """Adds a list of label strings to the vocabulary

    Use this to add new labels to your vocabulary (e.g., useful for reusing the weights of an existing classifier)

    Parameters
    ----------
    vocab: `allennlp.data.Vocabulary`
    labels: `List[str]`
        A list of strings containing the labels to add to an existing vocabulary
    """
    vocab.add_tokens_to_namespace(labels, namespace=LABELS_NAMESPACE)
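A brief usage sketch, assuming LABELS_NAMESPACE is "labels" (the constant is defined elsewhere in the module this helper comes from):

vocab = Vocabulary()
extend_labels(vocab, ["positive", "negative"])
extend_labels(vocab, ["neutral"])  # a later extension leaves earlier ids untouched

# "labels" is a non-padded namespace by default, so ids start at 0.
assert vocab.get_token_index("positive", namespace="labels") == 0
assert vocab.get_token_index("neutral", namespace="labels") == 2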
Example #4
    def test_model_loads_weights_correctly(self):
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(
            ["orange", "net", "netting", "pitcher", "catcher"], "answers")

        model_name = "epwalsh/bert-xsmall-dummy"
        model = VqaVilbert.from_huggingface_model_name(
            vocab=vocab,
            model_name=model_name,
            image_feature_dim=2048,
            image_num_hidden_layers=1,
            image_hidden_size=6,
            combined_hidden_size=10,
            pooled_output_dim=7,
            image_intermediate_size=11,
            image_attention_dropout=0.0,
            image_hidden_dropout=0.0,
            image_biattention_id=[0, 1],
            text_biattention_id=[0, 1],
            text_fixed_layer=0,
            image_fixed_layer=0,
            image_num_attention_heads=3,
            combined_num_attention_heads=2,
        )

        transformer = AutoModel.from_pretrained(model_name)

        # compare embedding parameters
        mapping = {
            val: key
            for key, val in
            model.backbone.text_embeddings._construct_default_mapping(
                transformer.embeddings, "huggingface", {}).items()
        }
        assert_equal_parameters(transformer.embeddings,
                                model.backbone.text_embeddings,
                                mapping=mapping)

        # compare encoder parameters
        mapping = {
            val: key
            for key, val in model.backbone.encoder._construct_default_mapping(
                transformer.encoder, "huggingface", {}).items()
        }

        # We ignore the new parameters for the second modality, since they won't be present
        # in the huggingface model.
        assert_equal_parameters(transformer.encoder,
                                model.backbone.encoder,
                                ignore_missing=True,
                                mapping=mapping)
Example #5
    def test_text_to_instance_with_basic_tokenizer_and_indexer(self):
        reader = NextTokenLmReader()

        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(["This", "is", "a"], "tokens")

        instance = reader.text_to_instance(sentence="This is a", target="This")
        assert [t.text for t in instance["tokens"]] == ["This", "is", "a"]
        assert [t.text for t in instance["target_ids"]] == ["This"]

        instance.index_fields(vocab)
        tensor_dict = instance.as_tensor_dict(instance.get_padding_lengths())
        assert tensor_dict.keys() == {"tokens", "target_ids"}
        assert tensor_dict["tokens"]["tokens"].numpy().tolist() == [2, 3, 4]
        assert tensor_dict["target_ids"]["tokens"].numpy().tolist() == [2]
Example #6
    def test_text_to_instance_with_basic_tokenizer_and_indexer(self):
        reader = NextTokenLmReader()

        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(['This', 'is', 'a'], 'tokens')

        instance = reader.text_to_instance(sentence='This is a', target='This')
        assert [t.text for t in instance['tokens']] == ['This', 'is', 'a']
        assert [t.text for t in instance['target_ids']] == ['This']

        instance.index_fields(vocab)
        tensor_dict = instance.as_tensor_dict(instance.get_padding_lengths())
        assert tensor_dict.keys() == {'tokens', 'target_ids'}
        assert tensor_dict['tokens']['tokens'].numpy().tolist() == [2, 3, 4]
        assert tensor_dict['target_ids']['tokens'].numpy().tolist() == [2]
Example #7
    def test_text_to_instance_with_basic_tokenizer_and_indexer(self):
        reader = MaskedLanguageModelingReader()

        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(["This", "is", "a", "[MASK]", "token", "."], "tokens")

        instance = reader.text_to_instance(sentence="This is a [MASK] token .", targets=["This"])
        assert [t.text for t in instance["tokens"]] == ["This", "is", "a", "[MASK]", "token", "."]
        assert [i.sequence_index for i in instance["mask_positions"]] == [3]
        assert [t.text for t in instance["target_ids"]] == ["This"]

        instance.index_fields(vocab)
        tensor_dict = instance.as_tensor_dict(instance.get_padding_lengths())
        assert tensor_dict.keys() == {"tokens", "mask_positions", "target_ids"}
        assert tensor_dict["tokens"]["tokens"].numpy().tolist() == [2, 3, 4, 5, 6, 7]
        assert tensor_dict["target_ids"]["tokens"].numpy().tolist() == [2]
        assert tensor_dict["mask_positions"].numpy().tolist() == [[3]]
Example #8
    def test_model_loads_weights_correctly(self):
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(
            ["orange", "net", "netting", "pitcher", "catcher"], "answers")

        model_name = "epwalsh/bert-xsmall-dummy"
        model = VqaVilbert.from_huggingface_model_name(
            vocab=vocab,
            model_name=model_name,
            image_feature_dim=2048,
            image_num_hidden_layers=1,
            image_hidden_size=6,
            combined_hidden_size=10,
            pooled_output_dim=7,
            image_intermediate_size=11,
            image_attention_dropout=0.0,
            image_hidden_dropout=0.0,
            image_biattention_id=[0, 1],
            text_biattention_id=[0, 1],
            text_fixed_layer=0,
            image_fixed_layer=0,
            image_num_attention_heads=3,
            combined_num_attention_heads=2,
        )

        transformer = AutoModel.from_pretrained(model_name)

        # compare embedding parameters
        assert_allclose(
            transformer.embeddings.word_embeddings.weight.data,
            model.backbone.text_embeddings.embeddings.word_embeddings.weight.
            data,
        )

        # compare encoder parameters
        assert_allclose(
            transformer.encoder.layer[0].intermediate.dense.weight.data,
            model.backbone.encoder.layers1[0].intermediate.dense.weight.data,
        )
Example #9
def build_vocab_fixed_labels(labels: list, instances: Iterable[Instance]) -> Vocabulary:
    logger.critical("Building the vocabulary")
    logger.critical("Initializing the labels namespace")
    vocab = Vocabulary()
    indexes = vocab.add_tokens_to_namespace(labels, namespace="labels")
    logger.critical(f"Mapped them\n{labels}\n{indexes}")
    logger.critical("Initializing the regular namespace")
    vocab.extend_from_instances(instances)

    second_indexes = [vocab.get_token_index(token, namespace="labels") for token in labels]
    # indexes = vocab.add_tokens_to_namespace(labels, namespace="labels")
    logger.critical(f"Mapped them\n{labels}\n{second_indexes}")
    return vocab
Example #10
    def test_text_to_instance_with_basic_tokenizer_and_indexer(self):
        reader = MaskedLanguageModelingReader()

        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(
            ['This', 'is', 'a', '[MASK]', 'token', '.'], 'tokens')

        instance = reader.text_to_instance(sentence='This is a [MASK] token .',
                                           targets=['This'])
        assert [t.text for t in instance['tokens']
                ] == ['This', 'is', 'a', '[MASK]', 'token', '.']
        assert [i.sequence_index for i in instance['mask_positions']] == [3]
        assert [t.text for t in instance['target_ids']] == ['This']

        instance.index_fields(vocab)
        tensor_dict = instance.as_tensor_dict(instance.get_padding_lengths())
        assert tensor_dict.keys() == {'tokens', 'mask_positions', 'target_ids'}
        assert tensor_dict['tokens']['tokens'].numpy().tolist() == [
            2, 3, 4, 5, 6, 7
        ]
        assert tensor_dict['target_ids']['tokens'].numpy().tolist() == [2]
        assert tensor_dict['mask_positions'].numpy().tolist() == [[3]]
Example #11
    def test_read(self):
        from allennlp_models.vision.dataset_readers.visual_entailment import VisualEntailmentReader

        reader = VisualEntailmentReader(
            image_dir=FIXTURES_ROOT / "vision" / "images" /
            "visual_entailment",
            image_loader=TorchImageLoader(),
            image_featurizer=Lazy(NullGridEmbedder),
            region_detector=Lazy(RandomRegionDetector),
            tokenizer=WhitespaceTokenizer(),
            token_indexers={"tokens": SingleIdTokenIndexer()},
        )
        instances = list(
            reader.read(
                "test_fixtures/vision/visual_entailment/sample_pairs.jsonl"))
        assert len(instances) == 16

        instance = instances[0]
        assert len(instance.fields) == 5
        assert len(instance["hypothesis"]) == 4
        sentence_tokens = [t.text for t in instance["hypothesis"]]
        assert sentence_tokens == ["A", "toddler", "sleeps", "outside."]
        assert instance["labels"].label == "contradiction"

        batch = Batch(instances)
        vocab = Vocabulary()
        vocab.add_tokens_to_namespace(
            ["entailment", "contradiction", "neutral"], "labels")
        batch.index_instances(vocab)
        tensors = batch.as_tensor_dict()

        # (batch size, num boxes (fake), num features (fake))
        assert tensors["box_features"].size() == (16, 2, 10)

        # (batch size, num boxes (fake), 4 coords)
        assert tensors["box_coordinates"].size() == (16, 2, 4)

        # (batch_size, num boxes (fake),)
        assert tensors["box_mask"].size() == (16, 2)
Example #12
    def predictions_to_labeled_instances(
        self, instances: Iterable[Instance],
        outputs: Dict[str, Union[numpy.ndarray, torch.Tensor,
                                 Iterable[Union[str, Category]]]]
    ) -> List[Instance]:
        predicted_tags = outputs['tags']
        predicted_probs = outputs['probs']

        adhoc_vocab = Vocabulary()
        new_instances = []

        cr = CategoryReader()
        gen = self._model.wrapped_model.generators[0]

        for instance, tags, probs in zip(instances, predicted_tags,
                                         predicted_probs):
            text_field: TextField = instance['tokens']
            length = text_field.sequence_length()
            for i in range(length):
                new_instance = instance.duplicate()

                if all(map((lambda x: isinstance(x, Category)), tags)):
                    cat = tags[i:i + 1]
                elif all(map((lambda x: isinstance(x, str)), tags)):
                    cat = [cr.read(tag) for tag in tags[i:i + 1]]
                else:
                    cat = gen.extract_outputs(
                        numpy.expand_dims(tags[i:i + 1], 0))[0]

                tags_field = ConstructiveSupertagField(cat, text_field, [i])
                adhoc_vocab.add_tokens_to_namespace(tags_field.labels,
                                                    'labels')
                new_instance.add_field('tags', tags_field)
                new_instance.add_field('probs', ArrayField(probs[i:i + 1]))
                new_instance.index_fields(adhoc_vocab)
                new_instances.append(new_instance)

        return new_instances
Example #13
from allennlp.data import Vocabulary
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
import torch

# This is what gets created by TextField.as_tensor with a SingleIdTokenIndexer;
# see the exercises above.
token_tensor = {"tokens": {"tokens": torch.LongTensor([1, 3, 2, 1, 4, 3])}}

vocab = Vocabulary()
vocab.add_tokens_to_namespace(["This", "is", "some", "text", "."],
                              namespace="token_vocab")

glove_file = "https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.50d.txt.gz"

# This is for embedding each token.
embedding = Embedding(
    vocab=vocab,
    vocab_namespace="token_vocab",
    embedding_dim=50,
    pretrained_file=glove_file,
)

embedder = BasicTextFieldEmbedder(token_embedders={"tokens": embedding})

embedded_tokens = embedder(token_tensor)
print(embedded_tokens.size())
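With the six token ids in token_tensor and embedding_dim=50, the printed size should be torch.Size([6, 50]), assuming the GloVe file downloads successfully; vocabulary entries that have no GloVe vector are randomly initialized.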
Example #14
import json
import argparse

from allennlp.data import Vocabulary

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--ontology-path', type=str, required=True)
    parser.add_argument('--output-path', type=str, required=True)
    args = parser.parse_args()

    with open(args.ontology_path) as f:
        ontology = json.load(f)

    vocab = Vocabulary()
    vocab.add_token_to_namespace(token='None', namespace='span_labels')
    vocab.add_token_to_namespace(token='@@PADDING@@', namespace='span_labels')
    vocab.add_tokens_to_namespace(tokens=list(ontology['args'].keys()),
                                  namespace='span_labels')
    vocab.add_tokens_to_namespace(tokens=list(ontology['events'].keys()),
                                  namespace='event_labels')
    vocab.save_to_files(args.output_path)
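A vocabulary saved this way can be restored with Vocabulary.from_files; a minimal round-trip sketch (the path stands in for whatever was passed as --output-path):

from allennlp.data import Vocabulary

restored = Vocabulary.from_files("path/to/output")  # the --output-path directory
# "span_labels" matches the default non-padded "*labels" pattern, so the first
# token added ('None') keeps index 0 after the round trip.
assert restored.get_token_index("None", namespace="span_labels") == 0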
Example #15
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.tokenizers import WordTokenizer, CharacterTokenizer
from allennlp.data import Vocabulary

# Splits text into words (instead of wordpieces or characters).
tokenizer = WordTokenizer()

# Represents each token with a single id from a vocabulary.
token_indexer = SingleIdTokenIndexer(namespace='token_vocab')

vocab = Vocabulary()
vocab.add_tokens_to_namespace(['This', 'is', 'some', 'text', '.'],
                              namespace='token_vocab')

text = "This is some text."
tokens = tokenizer.tokenize(text)
print(tokens)

text_field = TextField(tokens, {'tokens': token_indexer})

# In order to convert the token strings into integer ids, we need to tell the
# TextField what Vocabulary to use.
text_field.index(vocab)

# We typically batch things together when making tensors, which requires some
# padding computation.  Don't worry too much about the padding for now.
padding_lengths = text_field.get_padding_lengths()

tensor_dict = text_field.as_tensor(padding_lengths)
print(tensor_dict)
Example #16
def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
):
    """
    Expected results for ``test.json`` from the Open Entity dataset:
    {'micro_precision': 0.7997806072235107, 'micro_recall': 0.7657563090324402, 'micro_fscore': 0.7823987007141113}.

    Parameters
    ----------
    data_path : str
        Data path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-trained model, because the
        tokenizer of the downstream checkpoint is sometimes not compatible with allennlp.
    batch_size : int
    cuda_device : int
    result_save_path : str
    """
    import_module_and_submodules("examples_allennlp")

    tokenizer_kwargs = {"additional_special_tokens": [ENT]}
    reader = EntityTypingReader(
        tokenizer=PretrainedTransformerTokenizer(
            model_name=checkpoint_tokenizer_name,
            add_special_tokens=True,
            tokenizer_kwargs=tokenizer_kwargs),
        token_indexers={
            "tokens":
            PretrainedTransformerIndexer(model_name=checkpoint_tokenizer_name,
                                         tokenizer_kwargs=tokenizer_kwargs)
        },
        use_entity_feature=True,
    )

    transformers_tokenizer = LukeTokenizer.from_pretrained(
        checkpoint_model_name)
    transformers_model = LukeForEntityClassification.from_pretrained(
        checkpoint_model_name)

    vocab = Vocabulary()
    vocab.add_transformer_vocab(transformers_tokenizer, "tokens")
    num_labels = len(transformers_model.config.id2label)
    labels = [transformers_model.config.id2label[i] for i in range(num_labels)]
    vocab.add_tokens_to_namespace(labels, namespace="labels")

    # read model
    params = Params.from_file(
        model_config_path,
        ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name})
    model = Model.from_params(params, vocab=vocab)
    model.classifier = transformers_model.classifier
    model.eval()

    # set the GPU device to use
    if cuda_device < 0:
        device = torch.device("cpu")
    else:
        device = torch.device(f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(reader,
                                    data_path,
                                    batch_size=batch_size,
                                    shuffle=False)
    loader.index_with(model.vocab)
    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            batch = nn_util.move_to_device(batch, device)
            output_dict = model(**batch)

    metrics = model.get_metrics(reset=True)
    print(metrics)
    if result_save_path is not None:
        with open(result_save_path, "w") as f:
            json.dump(metrics, f)
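A hypothetical invocation of the function above; the checkpoint names are the LUKE models published by Studio Ousia (an assumption about what this script targets), and the paths are placeholders:

evaluate_transformers_checkpoint(
    data_path="data/open_entity/test.json",              # placeholder path
    model_config_path="configs/entity_typing.jsonnet",   # placeholder config
    checkpoint_model_name="studio-ousia/luke-large-finetuned-open-entity",
    checkpoint_tokenizer_name="studio-ousia/luke-large",
    batch_size=32,
    cuda_device=0,
    result_save_path="results/metrics.json",
)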
Example #17
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.tokenizers import WordTokenizer, CharacterTokenizer
from allennlp.data import Vocabulary

# Splits text into words (instead of wordpieces or characters).
tokenizer = WordTokenizer()

# Represents each token with both an id from a vocabulary and a sequence of characters.
token_indexers = {
    'tokens': SingleIdTokenIndexer(namespace='token_vocab'),
    'token_characters': TokenCharactersIndexer(namespace='character_vocab')
}

vocab = Vocabulary()
vocab.add_tokens_to_namespace(['This', 'is', 'some', 'text', '.'],
                              namespace='token_vocab')
vocab.add_tokens_to_namespace(
    ['T', 'h', 'i', 's', ' ', 'o', 'm', 'e', 't', 'x', '.'],
    namespace='character_vocab')

text = "This is some text."
tokens = tokenizer.tokenize(text)
print(tokens)

text_field = TextField(tokens, token_indexers)

# In order to convert the token strings into integer ids, we need to tell the
# TextField what Vocabulary to use.
text_field.index(vocab)

# We typically batch things together when making tensors, which requires some
# padding computation.  Don't worry too much about the padding for now.
padding_lengths = text_field.get_padding_lengths()

tensor_dict = text_field.as_tensor(padding_lengths)
print(tensor_dict)
Example #18
def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
    prediction_save_path: str,
):
    """
    Expected results for CoNLL-2003 NER English test set.
    {'f1': 0.9461946902654867, 'precision': 0.945859872611465, 'recall': 0.9465297450424929}

    Parameters
    ----------
    data_path : str
        Data path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-trained model, because the
        tokenizer of the downstream checkpoint is sometimes not compatible with allennlp.
    batch_size : int
    cuda_device : int
    result_save_path : str
    prediction_save_path : str
    """
    import_module_and_submodules("examples_allennlp")

    reader = ConllSpanReader(
        tokenizer=PretrainedTransformerTokenizer(
            model_name=checkpoint_tokenizer_name,
            add_special_tokens=False,
            tokenizer_kwargs={"add_prefix_space": True}),
        token_indexers={
            "tokens":
            PretrainedTransformerIndexer(model_name=checkpoint_tokenizer_name)
        },
        use_entity_feature=True,
    )

    transformers_tokenizer = LukeTokenizer.from_pretrained(
        checkpoint_model_name)
    transformers_model = LukeForEntitySpanClassification.from_pretrained(
        checkpoint_model_name)

    vocab = Vocabulary()
    vocab.add_transformer_vocab(transformers_tokenizer, "tokens")
    num_labels = len(transformers_model.config.id2label)
    labels = [transformers_model.config.id2label[i] for i in range(num_labels)]
    labels = ["O" if l == "NIL" else l for l in labels]
    vocab.add_tokens_to_namespace(labels, namespace="labels")

    # read model
    params = Params.from_file(
        model_config_path,
        ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name})
    if prediction_save_path is not None:
        params["prediction_save_path"] = prediction_save_path
    model = Model.from_params(params, vocab=vocab)
    model.classifier = transformers_model.classifier
    model.eval()

    # set the GPU device to use
    if cuda_device < 0:
        device = torch.device("cpu")
    else:
        device = torch.device(f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(reader,
                                    data_path,
                                    batch_size=batch_size,
                                    shuffle=False)
    loader.index_with(model.vocab)
    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            batch = nn_util.move_to_device(batch, device)
            output_dict = model(**batch)

    metrics = model.get_metrics(reset=True)
    print(metrics)
    if result_save_path is not None:
        with open(result_save_path, "w") as f:
            json.dump(metrics, f)
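The NER variant is invoked the same way, with the extra prediction_save_path argument; the checkpoint names are again an assumption and the paths are placeholders:

evaluate_transformers_checkpoint(
    data_path="data/conll2003/eng.testb",                 # placeholder path
    model_config_path="configs/ner.jsonnet",              # placeholder config
    checkpoint_model_name="studio-ousia/luke-large-finetuned-conll-2003",
    checkpoint_tokenizer_name="studio-ousia/luke-large",
    batch_size=32,
    cuda_device=0,
    result_save_path="results/ner_metrics.json",
    prediction_save_path="results/ner_predictions.json",
)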