def test_dict_field_can_iterate(self):
    from allennlp.data import Instance
    from allennlp.data.iterators import BasicIterator

    iterator = BasicIterator()
    iterator.index_with(self.vocab)

    instances = [
        Instance({"candidates": DictField(self.instance1_fields)}),
        Instance({"candidates": DictField(self.instance2_fields)})
    ]

    for batch in iterator(instances, num_epochs=1, shuffle=False):
        break

    expected_batch = {
        'candidates': {
            'candidate_entities': {
                'entity': torch.tensor([[[2, 3], [1, 0]],
                                        [[2, 0], [0, 0]]])
            },
            'candidate_entity_prior': torch.tensor(
                [[[0.5000, 0.5000], [1.0000, 0.0000]],
                 [[1.0000, 0.0000], [0.0000, 0.0000]]]),
            'candidate_spans': torch.tensor([[[0, 0], [1, 2]],
                                             [[1, 1], [-1, -1]]])
        }
    }

    self._check_tensors(batch['candidates'], expected_batch['candidates'])
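# The tensor tests in this file compare nested dicts of tensors with a
# _check_tensors helper that is not part of this excerpt.  A minimal sketch of
# what such a helper could look like (hypothetical; the real one may differ):
def _check_tensors(self, actual, expected):
    # Keys must match exactly at every nesting level.
    self.assertEqual(sorted(actual.keys()), sorted(expected.keys()))
    for key, expected_value in expected.items():
        if isinstance(expected_value, dict):
            # Recurse into nested dicts (e.g. a TextField's indexer output).
            self._check_tensors(actual[key], expected_value)
        else:
            # Compare tensors, tolerating integer vs. float dtypes.
            self.assertTrue(
                torch.allclose(actual[key].float(), expected_value.float()))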
def convert_tokens_candidates_to_fields(self, tokens_and_candidates):
    """
    `tokens_and_candidates` is the return value from a previous call to
    `generate_sentence_entity_candidates`.  Converts the dict to a dict of
    fields usable with allennlp.
    """
    fields = {}

    fields['tokens'] = TextField(
        [Token(t, text_id=self.bert_tokenizer.vocab[t])
         for t in tokens_and_candidates['tokens']],
        token_indexers=self._bert_single_id_indexer)

    fields['segment_ids'] = ArrayField(
        np.array(tokens_and_candidates['segment_ids']), dtype=np.int)

    all_candidates = {}
    for key, entity_candidates in tokens_and_candidates['candidates'].items():
        # Pad the priors to a rectangular array so they fit in an ArrayField.
        # Make a copy to avoid modifying the input.
        candidate_entity_prior = copy.deepcopy(
            entity_candidates['candidate_entity_priors'])
        max_cands = max(len(p) for p in candidate_entity_prior)
        for p in candidate_entity_prior:
            if len(p) < max_cands:
                p.extend([0.0] * (max_cands - len(p)))
        np_prior = np.array(candidate_entity_prior)

        candidate_fields = {
            "candidate_entity_priors": ArrayField(np_prior, dtype=self.dtype),
            "candidate_entities": TextField(
                [Token(" ".join(candidate_list))
                 for candidate_list in entity_candidates["candidate_entities"]],
                token_indexers={'ids': self._entity_indexers[key]}),
            "candidate_spans": ListField(
                [SpanField(span[0], span[1], fields['tokens'])
                 for span in entity_candidates['candidate_spans']]),
            "candidate_segment_ids": ArrayField(
                np.array(entity_candidates['candidate_segment_ids']),
                dtype=np.int)
        }
        all_candidates[key] = DictField(candidate_fields)

    fields["candidates"] = DictField(all_candidates)

    return fields
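# For reference, the `tokens_and_candidates` dict consumed above is expected
# to have the shape sketched below.  The key names come from the accesses in
# convert_tokens_candidates_to_fields; the concrete values (and the 'wordnet'
# candidate key) are purely illustrative:
#
# tokens_and_candidates = {
#     'tokens': ['[CLS]', 'the', 'cat', '[SEP]'],
#     'segment_ids': [0, 0, 0, 0],
#     'candidates': {
#         'wordnet': {
#             'candidate_spans': [[2, 2]],              # inclusive token spans
#             'candidate_entities': [['cat.n.01', 'cat.n.04']],
#             'candidate_entity_priors': [[0.9, 0.1]],  # ragged; padded above
#             'candidate_segment_ids': [0],
#         }
#     }
# }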
def test_list_field_of_dict_field(self):
    from allennlp.data import Instance
    from allennlp.data.iterators import BasicIterator

    tokens3 = "The long sentence .".split()
    tokens3_field = TextField(
        [Token(t) for t in tokens3],
        token_indexers={'tokens': SingleIdTokenIndexer()})

    instance3_fields = {
        "candidate_entities": TextField(
            [Token("entity1 entity2 entity3"),
             Token("entity_unk"),
             Token("entity2 entity3")],
            token_indexers=self.entity_indexer),
        "candidate_entity_prior": ArrayField(
            np.array([[0.1, 0.1, 0.8],
                      [1.0, 0.0, 0.0],
                      [0.33, 0.67, 0.0]])),
        "candidate_spans": ListField(
            [SpanField(1, 1, tokens3_field),
             SpanField(1, 2, tokens3_field),
             SpanField(1, 3, tokens3_field)])
    }

    iterator = BasicIterator()
    iterator.index_with(self.vocab)

    instances = [
        Instance({"candidates": ListField([
            DictField(self.instance1_fields),
            DictField(self.instance2_fields)])}),
        Instance({"candidates": ListField([
            DictField(self.instance1_fields),
            DictField(instance3_fields)])})
    ]

    for batch in iterator(instances, num_epochs=1, shuffle=False):
        break

    self.assertEqual(
        batch['candidates']['candidate_entities']['entity'].shape,
        batch['candidates']['candidate_entity_prior'].shape)
def test_get_padding_lengths(self):
    field = DictField(self.instance1_fields)
    field.index(self.vocab)
    lengths = field.get_padding_lengths()
    self.assertDictEqual(
        lengths,
        {
            'candidate_entities*entity_length': 2,
            'candidate_entities*num_token_characters': 2,
            'candidate_entities*num_tokens': 2,
            'candidate_entity_prior*dimension_0': 2,
            'candidate_entity_prior*dimension_1': 2,
            'candidate_spans*num_fields': 2
        })
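# The '*'-separated keys above suggest how DictField composes padding lengths:
# each subfield reports its own lengths, and DictField prefixes them with the
# dict key.  A minimal sketch of that method on DictField (hypothetical; shown
# here only to explain the key format):
def get_padding_lengths(self):
    padding_lengths = {}
    for key, field in self.field_dict.items():
        for sub_key, value in field.get_padding_lengths().items():
            padding_lengths[key + '*' + sub_key] = value
    return padding_lengths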
def _combine_instances(self, instance_a, instance_b, nsp_label, gold_cache):
    text_a = ' '.join([t.text for t in instance_a['tokens'].tokens])
    text_b = ' '.join([t.text for t in instance_b['tokens'].tokens])

    fields = self.tokenizer_and_masker.tokenize_candidates_mask(text_a, text_b)

    span_field = fields['candidates'].field_dict[self.id_type] \
                                     .field_dict['candidate_spans']
    candidate_spans = [
        [s.span_start, s.span_end] for s in span_field.field_list
    ]
    assert sorted(candidate_spans) == candidate_spans

    # Combine the gold entities from both sentences.
    golds = [gold_cache[text] for text in [text_a, text_b]]

    # Walk the (sorted) candidate spans; each span's segment id says which
    # sentence it came from, and j tracks how many golds have been consumed
    # from each sentence so far.
    combined_golds = []
    j = [-1, -1]
    for span in candidate_spans:
        i = fields['segment_ids'].array[span[0]]
        j[i] += 1
        combined_golds.append(golds[i][j[i]])

    gold_text_field = TextField(
        [Token(g) for g in combined_golds],
        token_indexers=self.entity_indexer)
    fields['gold_entities'] = DictField({self.id_type: gold_text_field})

    if self.use_nsp_label:
        fields['next_sentence_label'] = LabelField(nsp_label, skip_indexing=True)

    del fields['lm_label_ids']

    return Instance(fields)
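# A self-contained rehearsal of the gold-merging loop above with toy values
# (all names illustrative).  Spans arrive sorted by position (hence the
# assert), and each span's segment id selects which sentence's gold list to
# consume next:
candidate_spans = [[1, 2], [3, 3], [6, 7]]
segment_ids = [0, 0, 0, 0, 0, 1, 1, 1]  # tokens 5-7 belong to sentence B
golds = [['cat.n.01', 'dog.n.01'], ['fish.n.01']]
combined_golds = []
j = [-1, -1]
for span in candidate_spans:
    i = segment_ids[span[0]]
    j[i] += 1
    combined_golds.append(golds[i][j[i]])
assert combined_golds == ['cat.n.01', 'dog.n.01', 'fish.n.01']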
def test_dict_field_as_tensor(self):
    field = DictField(self.instance1_fields)
    field.index(self.vocab)
    tensor = field.as_tensor(field.get_padding_lengths())

    expected = {
        'candidate_entities': {
            'entity': torch.tensor([[2, 3], [1, 0]])
        },
        'candidate_entity_prior': torch.tensor([[0.5000, 0.5000],
                                                [1.0000, 0.0000]]),
        'candidate_spans': torch.tensor([[0, 0], [1, 2]])
    }

    self._check_tensors(tensor, expected)
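# The nested `expected` dict mirrors the structure of field_dict: DictField
# tensorizes each subfield under its own key.  A minimal sketch of as_tensor
# consistent with that (hypothetical; it undoes the 'key*' prefix added in
# get_padding_lengths):
def as_tensor(self, padding_lengths):
    tensors = {}
    for key, field in self.field_dict.items():
        sub_lengths = {
            name[len(key) + 1:]: value
            for name, value in padding_lengths.items()
            if name.startswith(key + '*')
        }
        tensors[key] = field.as_tensor(sub_lengths)
    return tensors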
def test_dict_field_can_handle_empty(self):
    field = DictField(self.instance1_fields)
    empty = field.empty_field()
    # The main check is that empty_field() doesn't raise; also make sure it
    # hands back a DictField, which is what ListField padding relies on.
    self.assertIsInstance(empty, DictField)
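# When a DictField sits inside a ListField, padding the list requires an
# empty element of the same type, so empty_field() presumably returns a
# DictField whose subfields are themselves empty.  A hypothetical sketch:
def empty_field(self):
    return DictField({key: field.empty_field()
                      for key, field in self.field_dict.items()})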