def to_begin_inside_tags(tag: Optional[str],
                         length: int) -> List[Optional[str]]:
    """Expand a single tag into a begin/inside sequence of the given length.

    An unprefixed tag is simply repeated; a prefixed tag becomes one
    B- tag followed by (length - 1) I- tags.
    """
    if not length:
        return []
    prefix, tag_value = split_tag_prefix(tag)
    if not prefix:
        return [tag] * length
    begin_tag = add_tag_prefix(tag_value, prefix=B_TAG_PREFIX)
    inside_tag = add_tag_prefix(tag_value, prefix=I_TAG_PREFIX)
    return [begin_tag] + [inside_tag] * (length - 1)
Example #2
0
 def append_tokenized_text_token(self, tokenized_text_token: str):
     """Append a token for the given text and demote the pending tags to I-."""
     if self.tokens:
         # record whitespace on the preceding token (empty when the
         # incoming text is not pure whitespace)
         if not tokenized_text_token.strip():
             self.tokens[-1].whitespace = tokenized_text_token
         else:
             self.tokens[-1].whitespace = ''
     new_token = _to_text_token(
         tokenized_text_token,
         tag=self.next_tag,
         sub_tag=self.next_sub_tag)
     self.append(new_token)
     # subsequent tokens continue the current entity: switch both
     # pending tags to their I- form
     self.next_tag = add_tag_prefix(
         strip_tag_prefix(self.next_tag), I_TAG_PREFIX)
     self.next_sub_tag = add_tag_prefix(
         strip_tag_prefix(self.next_sub_tag), I_TAG_PREFIX)
Example #3
0
 def annotate(self,
              structured_document: GrobidTrainingTeiStructuredDocument):
     """Assign a group tag to untagged tokens that follow a matched tag.

     There is currently no support for more than two level tagging,
     which would allow a parent level to be represented.
     As a workaround we are adding a separate parent tag to the tokens
     without a tag. That way those tokens will share a common parent
     element in the output.
     """
     tag_level = self.config.tag_level
     all_tokens_iterable = iter_all_tokens_excluding_space(
         structured_document)
     unmatched_tags = set()
     # group tag derived from the most recently seen tagged token
     current_group_tag = None
     for token in all_tokens_iterable:
         if tag_level:
             # if we are looking at sub tags, then only consider tokens with a tag
             if not structured_document.get_tag_or_preserved_tag(token):
                 continue
         tag = structured_document.get_tag_or_preserved_tag_value(
             token, level=tag_level)
         if tag:
             # a tagged token establishes (or clears) the current group
             current_group_tag = self.config.get_group_tag_for_tag_fn(tag)
             if not current_group_tag:
                 unmatched_tags.add(tag)
             continue
         if not current_group_tag:
             # untagged token before any group was established: leave as-is
             continue
         # untagged token within a group: tag it as a continuation (I-)
         structured_document.set_tag_only(token,
                                          add_tag_prefix(
                                              current_group_tag,
                                              prefix=I_TAG_PREFIX),
                                          level=tag_level)
         LOGGER.debug('updated group token (%r): %s', current_group_tag,
                      token)
     LOGGER.debug('ignored unmatched tags: %s', unmatched_tags)
     return structured_document
Example #4
0
def get_entity_tokens(tag: str, value: str) -> List[SimpleToken]:
    """Tokenize *value* and tag the tokens as one begin/inside entity."""
    entity_tokens = []
    for index, token_text in enumerate(get_token_texts_for_text(value)):
        prefix = B_TAG_PREFIX if index == 0 else I_TAG_PREFIX
        entity_tokens.append(
            SimpleToken(token_text, tag=add_tag_prefix(tag, prefix=prefix)))
    return entity_tokens
def get_merged_begin_inside_tags_of_same_tag_value(
        tags: Optional[List[Optional[str]]]) -> List[Optional[str]]:
    """Keep the first tag as-is and rewrite the rest as I- tags of its value."""
    if not tags:
        return []
    first_prefix, first_tag_value = split_tag_prefix(tags[0])
    if not first_prefix:
        # unprefixed tags need no merging
        return tags
    inside_tag = add_tag_prefix(first_tag_value, prefix=I_TAG_PREFIX)
    return tags[:1] + [inside_tag] * (len(tags) - 1)
 def process_sub_annotations(self,
                             structured_document: T_StructuredDocument,
                             text: SequencesText, index_range: Tuple[int,
                                                                     int],
                             sub_annotations: List[TargetAnnotation]):
     """Fuzzy-match each sub annotation within the given index range and
     set the corresponding sub tags on the matched tokens.

     Tokens that already carry a sub tag are never overwritten; for each
     sub annotation, the first acceptable match wins.
     """
     if not sub_annotations:
         return
     LOGGER.debug('processing sub annotations: %s', sub_annotations)
     tokens = list(text.iter_tokens_between(index_range))
     LOGGER.debug('sub_tokens: %s', tokens)
     sub_text = SequencesText(
         [SequenceWrapper(structured_document, tokens)])
     sub_text_str = str(sub_text).lower()
     LOGGER.debug('sub_text_str: %r', sub_text_str)
     for sub_annotation in sub_annotations:
         sub_tag_name = sub_annotation.name
         target_value = sub_annotation.value
         assert not isinstance(
             target_value, list), 'list sub annotation values not supported'
         target_value = target_value.lower()
         sub_index_ranges_iterable = iter_fuzzy_search_all_index_ranges(
             sub_text_str,
             target_value,
             threshold=self.config.threshold,
             exact_word_match_threshold=self.config.
             exact_word_match_threshold)
         for sub_index_range in sub_index_ranges_iterable:
             LOGGER.debug(
                 'sub_annotation match: sub_tag=%r, value=%r sub_index_range=%s',
                 sub_tag_name, target_value, sub_index_range)
             matching_tokens = list(
                 sub_text.iter_tokens_between(sub_index_range))
             LOGGER.debug('setting sub matching_tokens to "%s": %s',
                          sub_tag_name, matching_tokens)
             existing_matching_sub_tags = [
                 structured_document.get_sub_tag(token)
                 for token in matching_tokens
             ]
             if any(existing_matching_sub_tags):
                 LOGGER.debug('some tokens already have sub tags, skipping')
                 continue
             for index, token in enumerate(matching_tokens):
                 prefix = None
                 if self.config.use_begin_prefix:
                     prefix = B_TAG_PREFIX if index == 0 else I_TAG_PREFIX
                 full_tag = add_tag_prefix(sub_tag_name, prefix=prefix)
                 structured_document.set_sub_tag(token, full_tag)
             # accept the index range and move to next sub tag
             break
         else:
             # fix: previously this was logged unconditionally, reporting
             # "not found" even when a match had been accepted above;
             # the for/else only runs when the loop was not broken out of
             LOGGER.debug(
                 'sub_annotation match not found: sub_tag=%r, value=%r',
                 sub_tag_name, target_value)
Example #7
0
 def _preserve_current_tags(self):
     """Copy each token's current tag (mapped back via the reverse
     tei-path mapping) into its preserved tag, for both tag levels."""
     # invert the tag -> tei path mapping so tei paths map back to tags
     tei_path_to_tag = {
         tei_path: tag
         for tag, tei_path in self._tag_to_tei_path_mapping.items()
     }
     LOGGER.debug(
         'preserving tei tags using rev_tag_to_tei_path_mapping: %s',
         tei_path_to_tag)
     for line in self._lines:
         for token in line.tokens:
             for level in (None, SUB_LEVEL):
                 prefix, existing_tag = split_tag_prefix(
                     self.get_tag(token, level=level))
                 # map the tag value while keeping its B-/I- prefix
                 mapped_tag = add_tag_prefix(
                     tei_path_to_tag.get(existing_tag, existing_tag),
                     prefix=prefix)
                 self._set_preserved_tag(token, mapped_tag, level=level)
 def update_annotation_for_index_range(
         self, structured_document: T_StructuredDocument,
         text: SequencesText, index_range: Tuple[int, int], tag_name: str):
     """Tag the not-yet-tagged tokens within index_range with tag_name."""
     matching_tokens = list(text.iter_tokens_between(index_range))
     LOGGER.debug('setting matching_tokens to "%s": [%s]', tag_name,
                  matching_tokens)
     LOGGER.debug('setting matching text to "%s": [%s]', tag_name,
                  join_tokens_text(matching_tokens))
     untagged_tokens = [
         token for token in matching_tokens
         if not structured_document.get_tag(token)
     ]
     use_begin_prefix = self.config.use_begin_prefix
     preserve_sub_annotations = self.config.preserve_sub_annotations
     for index, token in enumerate(untagged_tokens):
         if use_begin_prefix:
             prefix = B_TAG_PREFIX if index == 0 else I_TAG_PREFIX
         else:
             prefix = None
         full_tag = add_tag_prefix(tag_name, prefix=prefix)
         if preserve_sub_annotations:
             # keep any existing sub tag, only update the main tag
             structured_document.set_tag_only(token, full_tag)
         else:
             structured_document.set_tag(token, full_tag)
Example #9
0
    TargetAnnotation)

from sciencebeam_trainer_grobid_tools.annotation.simple_matching_annotator import (
    SimpleTagConfig, SimpleMatchingAnnotator, get_extended_line_token_tags,
    get_simple_tag_config_map, select_index_ranges, DEFAULT_MERGE_ENABLED,
    DEFAULT_EXTEND_TO_LINE_ENABLED, DEFAULT_MAX_CHUNKS)

from tests.test_utils import log_on_exception

LOGGER = logging.getLogger(__name__)

# tag values used as fixtures throughout the tests below
TAG1 = 'tag1'
TAG2 = 'tag2'
TAG3 = 'tag3'

# begin (B-) and inside (I-) prefixed variants of the fixture tags
B_TAG1 = add_tag_prefix(TAG1, prefix=B_TAG_PREFIX)
I_TAG1 = add_tag_prefix(TAG1, prefix=I_TAG_PREFIX)

B_TAG2 = add_tag_prefix(TAG2, prefix=B_TAG_PREFIX)
I_TAG2 = add_tag_prefix(TAG2, prefix=I_TAG_PREFIX)


def _get_tags_of_tokens(tokens, **kwargs):
    return [t.get_tag(**kwargs) for t in tokens]


def _get_tag_values_of_tokens(tokens, **kwargs):
    """Return each token's tag with any B-/I- prefix stripped."""
    return [
        strip_tag_prefix(tag)
        for tag in _get_tags_of_tokens(tokens, **kwargs)
    ]
def _map_tag(tag: str, tag_map: Dict[str, str]) -> str:
    """Translate the tag's value via tag_map while keeping its B-/I- prefix."""
    prefix, tag_value = split_tag_prefix(tag)
    if tag_value:
        mapped_value = tag_map.get(tag_value, tag_value)
    else:
        mapped_value = tag_value
    return add_tag_prefix(tag=mapped_value, prefix=prefix)
def to_inside_tag(tag: Optional[str]) -> Optional[str]:
    """Convert a B- tag to the corresponding I- tag; pass others through."""
    prefix, tag_value = split_tag_prefix(tag)
    if prefix != B_TAG_PREFIX:
        return tag
    return add_tag_prefix(tag_value, prefix=I_TAG_PREFIX)
def to_begin_tag(tag: str) -> str:
    """Convert an I- tag to the corresponding B- tag; pass others through."""
    prefix, tag_value = split_tag_prefix(tag)
    if prefix != I_TAG_PREFIX:
        return tag
    return add_tag_prefix(tag_value, prefix=B_TAG_PREFIX)
Example #13
0
 def set_next_sub_tag(self, tag: Optional[str], begin_tag: bool = True):
     """Set the sub tag to use for the next appended token.

     With begin_tag=True the tag gets a B- prefix, otherwise an I- prefix.
     """
     prefix = B_TAG_PREFIX if begin_tag else I_TAG_PREFIX
     self.next_sub_tag = add_tag_prefix(tag, prefix=prefix)