Example #1
    def test_dict_field_can_iterator(self):
        from allennlp.data import Instance
        from allennlp.data.iterators import BasicIterator

        iterator = BasicIterator()
        iterator.index_with(self.vocab)

        instances = [
            Instance({"candidates": DictField(self.instance1_fields)}),
            Instance({"candidates": DictField(self.instance2_fields)})
        ]

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            break

        expected_batch = {
            'candidates': {
                'candidate_entities': {
                    'entity': torch.tensor([[[2, 3], [1, 0]],
                                            [[2, 0], [0, 0]]])
                },
                'candidate_entity_prior': torch.tensor(
                    [[[0.5000, 0.5000], [1.0000, 0.0000]],
                     [[1.0000, 0.0000], [0.0000, 0.0000]]]),
                'candidate_spans': torch.tensor(
                    [[[0, 0], [1, 2]], [[1, 1], [-1, -1]]])
            }
        }

        self._check_tensors(batch['candidates'], expected_batch['candidates'])
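
A note on the helper used above: _check_tensors is referenced here and in
Example #6 but not shown in these listings. A minimal sketch, assuming it
recursively compares possibly nested dicts of tensors:

    def _check_tensors(self, actual, expected):
        # sketch (assumption): walk nested dicts in parallel and compare the
        # tensors at the leaves
        self.assertEqual(sorted(actual.keys()), sorted(expected.keys()))
        for key, expected_value in expected.items():
            if isinstance(expected_value, dict):
                self._check_tensors(actual[key], expected_value)
            else:
                self.assertTrue(
                    torch.allclose(actual[key].float(), expected_value.float()))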
Example #2

    def convert_tokens_candidates_to_fields(self, tokens_and_candidates):
        """
        tokens_and_candidates is the return from a previous call to
        generate_sentence_entity_candidates.  Converts the dict to
        a dict of fields usable with allennlp.
        """
        fields = {}

        fields['tokens'] = TextField(
            [
                Token(t, text_id=self.bert_tokenizer.vocab[t])
                for t in tokens_and_candidates['tokens']
            ],
            token_indexers=self._bert_single_id_indexer)

        fields['segment_ids'] = ArrayField(
            np.array(tokens_and_candidates['segment_ids']), dtype=int)

        all_candidates = {}
        for key, entity_candidates in tokens_and_candidates['candidates'].items():
            # pad the prior to create the array field
            # make a copy to avoid modifying the input
            candidate_entity_prior = copy.deepcopy(
                entity_candidates['candidate_entity_priors'])
            max_cands = max(len(p) for p in candidate_entity_prior)
            for p in candidate_entity_prior:
                if len(p) < max_cands:
                    p.extend([0.0] * (max_cands - len(p)))
            np_prior = np.array(candidate_entity_prior)

            candidate_fields = {
                "candidate_entity_priors": ArrayField(np_prior, dtype=self.dtype),
                "candidate_entities": TextField(
                    [Token(" ".join(candidate_list))
                     for candidate_list in entity_candidates["candidate_entities"]],
                    token_indexers={'ids': self._entity_indexers[key]}),
                "candidate_spans": ListField([
                    SpanField(span[0], span[1], fields['tokens'])
                    for span in entity_candidates['candidate_spans']
                ]),
                "candidate_segment_ids": ArrayField(
                    np.array(entity_candidates['candidate_segment_ids']),
                    dtype=int)
            }
            all_candidates[key] = DictField(candidate_fields)

        fields["candidates"] = DictField(all_candidates)

        return fields
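
A hypothetical call site, reusing the Instance import from Example #1 (the
surrounding workflow is an assumption, not shown in the source):

    # sketch (assumption): tokens_and_candidates comes from a prior call to
    # generate_sentence_entity_candidates, as the docstring above notes
    fields = self.convert_tokens_candidates_to_fields(tokens_and_candidates)
    instance = Instance(fields)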
Example #3
    def test_list_field_of_dict_field(self):
        from allennlp.data import Instance
        from allennlp.data.iterators import BasicIterator

        tokens3 = "The long sentence .".split()
        tokens3_field = TextField(
            [Token(t) for t in tokens3],
            token_indexers={'tokens': SingleIdTokenIndexer()})

        instance3_fields = {
            "candidate_entities": TextField(
                [Token("entity1 entity2 entity3"),
                 Token("entity_unk"),
                 Token("entity2 entity3")],
                token_indexers=self.entity_indexer),
            "candidate_entity_prior": ArrayField(
                np.array([[0.1, 0.1, 0.8],
                          [1.0, 0.0, 0.0],
                          [0.33, 0.67, 0.0]])),
            "candidate_spans": ListField([
                SpanField(1, 1, tokens3_field),
                SpanField(1, 2, tokens3_field),
                SpanField(1, 3, tokens3_field)
            ])
        }

        iterator = BasicIterator()
        iterator.index_with(self.vocab)

        instances = [
            Instance({
                "candidates":
                ListField([
                    DictField(self.instance1_fields),
                    DictField(self.instance2_fields)
                ])
            }),
            Instance({
                "candidates":
                ListField([
                    DictField(self.instance1_fields),
                    DictField(instance3_fields)
                ])
            })
        ]

        for batch in iterator(instances, num_epochs=1, shuffle=False):
            pass

        self.assertTrue(
            batch['candidates']['candidate_entities']['entity'].shape ==
            batch['candidates']['candidate_entity_prior'].shape)
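
The shape equality asserted above follows from DictField delegating batching
to its child fields key by key. One plausible batch_tensors consistent with
this test, offered as an assumption rather than the library's confirmed code:

    def batch_tensors(self, tensor_list):
        # sketch: batch each key independently via the corresponding child field
        return {
            key: self.field_dict[key].batch_tensors([t[key] for t in tensor_list])
            for key in tensor_list[0].keys()
        }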
Example #4
    def test_get_padding_lengths(self):
        field = DictField(self.instance1_fields)
        field.index(self.vocab)
        lengths = field.get_padding_lengths()
        self.assertDictEqual(
            lengths, {
                'candidate_entities*entity_length': 2,
                'candidate_entities*num_token_characters': 2,
                'candidate_entities*num_tokens': 2,
                'candidate_entity_prior*dimension_0': 2,
                'candidate_entity_prior*dimension_1': 2,
                'candidate_spans*num_fields': 2
            })
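
The expected keys show that DictField namespaces each child field's padding
lengths as "<field name>*<padding key>". A minimal get_padding_lengths
consistent with that output (an assumption about the implementation):

    def get_padding_lengths(self):
        # sketch: prefix every child field's padding keys with its name and '*'
        lengths = {}
        for name, field in self.field_dict.items():
            for key, value in field.get_padding_lengths().items():
                lengths['{}*{}'.format(name, key)] = value
        return lengths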
Example #5
    def _combine_instances(self, instance_a, instance_b, nsp_label, gold_cache):
        text_a = ' '.join([t.text for t in instance_a['tokens'].tokens])
        text_b = ' '.join([t.text for t in instance_b['tokens'].tokens])

        fields = self.tokenizer_and_masker.tokenize_candidates_mask(text_a, text_b)
        candidate_spans = [
            [s.span_start, s.span_end]
            for s in fields['candidates'].field_dict[self.id_type].field_dict['candidate_spans'].field_list
        ]
        assert sorted(candidate_spans) == candidate_spans

        # combine the gold entities
        golds = []
        for text in [text_a, text_b]:
            golds.append(gold_cache[text])

        # walk the candidate spans in order: segment_ids says which sentence a
        # span belongs to, and j[i] counts how many golds from sentence i have
        # been consumed so far
        combined_golds = []
        j = [-1, -1]
        for span in candidate_spans:
            i = fields['segment_ids'].array[span[0]]
            j[i] += 1
            combined_golds.append(golds[i][j[i]])

        gold_text_field = TextField(
            [Token(g) for g in combined_golds],
            token_indexers=self.entity_indexer
        )
        fields['gold_entities'] = DictField({self.id_type: gold_text_field})

        if self.use_nsp_label:
            fields['next_sentence_label'] = LabelField(nsp_label, skip_indexing=True)

        del fields['lm_label_ids']

        return Instance(fields)
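
For context, _combine_instances assumes gold_cache maps each sentence's raw
text to its gold entity ids, listed in the same order as that sentence's
candidate spans. A hypothetical entry (illustrative values only):

    # hypothetical: one gold entity id per candidate span, in span order
    gold_cache = {
        "Paris is the capital of France .": ["Paris", "France"],
        "It has many museums .": [],
    }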
Example #6
    def test_dict_field_as_tensor(self):
        field = DictField(self.instance1_fields)
        field.index(self.vocab)
        tensor = field.as_tensor(field.get_padding_lengths())

        expected = {
            'candidate_entities': {
                'entity': torch.tensor([[2, 3], [1, 0]])
            },
            'candidate_entity_prior': torch.tensor([[0.5000, 0.5000],
                                                    [1.0000, 0.0000]]),
            'candidate_spans': torch.tensor([[0, 0], [1, 2]])
        }

        self._check_tensors(tensor, expected)
Example #7
    def test_dict_field_can_handle_empty(self):
        field = DictField(self.instance1_fields)
        empty = field.empty_field()
        # only checks that empty_field() runs without raising
        self.assertTrue(True)
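
The test above only exercises that empty_field() does not raise. A minimal
empty_field consistent with that behavior, offered as an assumption rather
than the library's confirmed code:

    def empty_field(self):
        # sketch: an empty DictField delegates to each child field's empty_field()
        return DictField(
            {name: field.empty_field()
             for name, field in self.field_dict.items()})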