Пример #1
0
 def tag_to_span(self, batch_tags, batch: dict):
     """Turn each tag sequence into spans, forcing custom words first.

     Args:
         batch_tags: One mutable tag sequence per sample. Sequences are
             edited in place when custom words apply.
         batch: Batch dict. When it contains ``'custom_words'`` it must also
             contain ``'token_subtoken_offsets'``. Each ``custom_words``
             entry is either falsy or an iterable of ``(start, end, label)``
             triples with ``end`` exclusive; ``label`` is not used here.

     Returns:
         A list with the result of ``bmes_to_spans`` for every sequence.
     """
     # Nothing to force: convert the predictions as they are.
     if 'custom_words' not in batch:
         return [bmes_to_spans(seq) for seq in batch_tags]

     # Tag vocabulary for a forced word; any scheme other than BMES uses B/I.
     if self.config.tagging_scheme == 'BMES':
         single, inner, last = 'S', 'M', 'E'
     else:
         single, inner, last = 'B', 'I', 'I'

     results = []
     aligned = zip(batch_tags, batch['token_subtoken_offsets'],
                   batch['custom_words'])
     for seq, offsets, forced in aligned:
         assert len(seq) == len(offsets)
         for begin, stop, _label in forced or ():
             if stop - begin == 1:
                 seq[begin] = single
             else:
                 seq[begin] = 'B'
                 seq[stop - 1] = last
                 for mid in range(begin + 1, stop - 1):
                     seq[mid] = inner
             # Whatever follows the forced word has to open a new span.
             if stop < len(seq):
                 seq[stop] = 'B'
         results.append(bmes_to_spans(seq))
     return results
Пример #2
0
 def tag_to_span(self, batch_tags, batch: dict):
     """Convert tag sequences into spans after two in-place repair passes.

     Pass 1 (only when ``'custom_words'`` is in ``batch``) overwrites tags so
     that every custom word becomes exactly one span. Pass 2 merges
     consecutive subtokens whose offsets overlap, so that a single character
     split into several subtokens is not cut into several spans.

     Args:
         batch_tags: One mutable tag sequence per sample; edited in place.
         batch: Batch dict. Must contain ``'token_subtoken_offsets'`` (one
             sequence of ``(begin, end)`` pairs per sample). May contain
             ``'custom_words'``, where each entry is either falsy or an
             iterable of ``(start, end, label)`` triples with ``end``
             exclusive; ``label`` is not used here.

     Returns:
         A list with the result of ``bmes_to_spans`` for every sequence.
     """
     if 'custom_words' in batch:
         # Tag vocabulary for a forced word; any scheme other than BMES uses B/I.
         if self.config.tagging_scheme == 'BMES':
             single, inner, last = 'S', 'M', 'E'
         else:
             single, inner, last = 'B', 'I', 'I'
         for tags, custom_words in zip(batch_tags, batch['custom_words']):
             for start, end, _label in custom_words or ():
                 if end - start == 1:
                     tags[start] = single
                 else:
                     tags[start] = 'B'
                     tags[end - 1] = last
                     for i in range(start + 1, end - 1):
                         tags[i] = inner
                 # Whatever follows the forced word has to open a new span.
                 if end < len(tags):
                     tags[end] = 'B'

     # Check cases that a single char gets split into multiple subtokens, e.g., ‥ -> . + .
     for tags, subtoken_offsets in zip(batch_tags,
                                       batch['token_subtoken_offsets']):
         # Reset per sequence: offsets restart for every sample, so state
         # carried over from the previous sample would make the first
         # subtoken look like an overlap and corrupt tags[0] and tags[-1].
         offset = -1  # BERT produces 'ᄒ', '##ᅡ', '##ᆫ' for '한' and they share the same span
         prev_tag = None
         for i, (_, (b, e)) in enumerate(zip(tags, subtoken_offsets)):
             if b < offset:
                 # This subtoken starts before the previous one ended, so it
                 # continues the same character: reopen the previous tag.
                 if prev_tag == 'S':
                     tags[i - 1] = 'B'
                 elif prev_tag == 'E':
                     tags[i - 1] = 'M'
                 tags[i] = 'M'
             offset = e
             # Track the tag as it now stands; the stale pre-rewrite value
             # would turn the 'M' just written back into 'B' when three or
             # more subtokens share a span.
             prev_tag = tags[i]

     return [bmes_to_spans(tags) for tags in batch_tags]