def tag_to_span(self, batch_tags, batch: dict):
    """Decode a batch of tag sequences into spans, honouring forced custom words.

    When ``batch`` carries a ``'custom_words'`` entry, every
    ``(start, end, label)`` triple listed for a sequence is written over the
    tags of that sequence (``end`` is exclusive, ``label`` is not used here)
    before the sequence is handed to ``bmes_to_spans``.

    Args:
        batch_tags: One mutable tag sequence per sample. Sequences are
            modified in place when custom words apply.
        batch: Batch dictionary. ``'custom_words'`` is optional; when it is
            present, ``'token_subtoken_offsets'`` is read as well, but only
            to check that each sequence has one tag per subtoken.

    Returns:
        A list with the result of ``bmes_to_spans`` for each tag sequence.
    """
    if 'custom_words' not in batch:
        return [bmes_to_spans(sequence) for sequence in batch_tags]

    # Labels for a one-token word, a word-internal token and a word-final
    # token. The word-initial label is 'B' in both schemes.
    if self.config.tagging_scheme == 'BMES':
        single, inner, last = 'S', 'M', 'E'
    else:
        single, inner, last = 'B', 'I', 'I'

    decoded = []
    rows = zip(batch_tags, batch['token_subtoken_offsets'], batch['custom_words'])
    for sequence, offsets, forced_words in rows:
        assert len(sequence) == len(offsets)
        for begin, stop, _label in forced_words or ():
            if stop - begin == 1:
                sequence[begin] = single
            else:
                sequence[begin] = 'B'
                sequence[stop - 1] = last
                for position in range(begin + 1, stop - 1):
                    sequence[position] = inner
            # Whatever follows a forced word has to open a new word.
            if stop < len(sequence):
                sequence[stop] = 'B'
        decoded.append(bmes_to_spans(sequence))
    return decoded
def tag_to_span(self, batch_tags, batch: dict):
    """Decode a batch of tag sequences into spans.

    Two in-place corrections are applied to ``batch_tags`` before decoding:

    1. If ``batch`` contains ``'custom_words'``, every ``(start, end, label)``
       triple listed for a sequence is written over the tags of that
       sequence (``end`` is exclusive, ``label`` is not used here), and the
       tag right after the word is set to ``'B'``.
    2. Subtokens whose start offset lies before the end offset of the
       previous subtoken are merged into the previous subtoken's word.

    Args:
        batch_tags: One mutable tag sequence per sample; modified in place.
        batch: Batch dictionary. ``'token_subtoken_offsets'`` is required and
            holds one ``(begin, end)`` pair per subtoken for each sequence.
            ``'custom_words'`` is optional.

    Returns:
        A list with the result of ``bmes_to_spans`` for each tag sequence.
    """
    if 'custom_words' in batch:
        # Labels for a one-token word, a word-internal token and a word-final
        # token. The word-initial label is 'B' in both schemes.
        if self.config.tagging_scheme == 'BMES':
            single, inner, last = 'S', 'M', 'E'
        else:
            single, inner, last = 'B', 'I', 'I'
        for tags, custom_words in zip(batch_tags, batch['custom_words']):
            if not custom_words:
                continue
            for start, end, _label in custom_words:
                if end - start == 1:
                    tags[start] = single
                else:
                    tags[start] = 'B'
                    tags[end - 1] = last
                    for i in range(start + 1, end - 1):
                        tags[i] = inner
                # Whatever follows a forced word has to open a new word.
                if end < len(tags):
                    tags[end] = 'B'

    # Check cases that a single char gets split into multiple subtokens,
    # e.g., ‥ -> . + .
    for tags, subtoken_offsets in zip(batch_tags, batch['token_subtoken_offsets']):
        # Reset the trackers for every sequence. Carrying them over from the
        # previous sequence made the first subtoken look like an overlap and
        # rewrote tags[-1] through the i - 1 index at i == 0.
        # NOTE(review): assumes offsets are relative to their own sequence and
        # non-negative, so the first subtoken never compares below -1.
        offset = -1  # BERT produces 'ᄒ', '##ᅡ', '##ᆫ' for '한' and they share the same span
        prev_tag = None
        for i, (tag, (b, e)) in enumerate(zip(tags, subtoken_offsets)):
            if b < offset:
                # The previous subtoken can no longer close a word, because
                # this subtoken belongs to the same character.
                if prev_tag == 'S':
                    tags[i - 1] = 'B'
                elif prev_tag == 'E':
                    tags[i - 1] = 'M'
                tags[i] = 'M'
            offset = e
            # Deliberately the tag as read from the sequence, before any
            # rewrite above, matching the original behaviour.
            prev_tag = tag

    return [bmes_to_spans(tags) for tags in batch_tags]