def set_tag(self, parent, tag, scope=None, level: Optional[T_Tag_Level] = None):
    _previous_tag = self.get_tag_or_preserved_tag(parent, level=level)
    self.set_tag_only(parent, tag, scope=scope, level=level)
    if isinstance(parent, TeiSpace):
        return
    if strip_tag_prefix(tag) != strip_tag_prefix(_previous_tag):
        self._clear_same_preserved_tag_on_same_line(parent, level=level)
        if level is None:
            self._clear_same_preserved_tag_on_same_line(parent, level=SUB_LEVEL)
def append_tokenized_text_token(self, tokenized_text_token: str):
    if self.tokens:
        # record a whitespace-only token as the previous token's trailing whitespace,
        # otherwise reset that whitespace
        if not tokenized_text_token.strip():
            self.tokens[-1].whitespace = tokenized_text_token
        else:
            self.tokens[-1].whitespace = ''
    self.append(_to_text_token(
        tokenized_text_token, tag=self.next_tag, sub_tag=self.next_sub_tag
    ))
    # subsequent tokens continue the current tag using the inside prefix
    self.next_tag = add_tag_prefix(strip_tag_prefix(self.next_tag), I_TAG_PREFIX)
    self.next_sub_tag = add_tag_prefix(strip_tag_prefix(self.next_sub_tag), I_TAG_PREFIX)
def get_etal_mapped_tags(
        token_tags: List[str],
        etal_sub_tag: str,
        etal_merge_enabled_sub_tags: Set[str]) -> List[str]:
    grouped_token_tags = [
        list(group)
        for _, group in groupby(token_tags, key=strip_tag_prefix)
    ]
    LOGGER.debug('grouped_token_tags: %s', grouped_token_tags)
    result = []
    previous_accepted_group_sub_tag = None
    for group in grouped_token_tags:
        group_tag = group[0]
        group_tag_value = strip_tag_prefix(group_tag)
        if group_tag_value != etal_sub_tag or not previous_accepted_group_sub_tag:
            # keep the group unchanged; remember it if it may absorb a following "etal" group
            result.extend(group)
            if group_tag_value in etal_merge_enabled_sub_tags:
                previous_accepted_group_sub_tag = group_tag
            elif group_tag:
                previous_accepted_group_sub_tag = None
            continue
        # "etal" group following a merge-enabled group: map it onto that group's tag
        result.append(previous_accepted_group_sub_tag)
        result.extend(
            [to_inside_tag(previous_accepted_group_sub_tag)] * (len(group) - 1)
        )
    return result
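# Illustrative usage sketch for get_etal_mapped_tags (not from the original source;
# the tag names and 'b-'/'i-' prefix values are assumptions for illustration):
#
#     get_etal_mapped_tags(
#         token_tags=['b-author', 'i-author', 'b-etal', 'i-etal', 'b-year'],
#         etal_sub_tag='etal',
#         etal_merge_enabled_sub_tags={'author'}
#     )
#
# The 'etal' group directly follows a merge-enabled 'author' group, so its tokens
# are re-tagged using the remembered author tag and its inside variant; the 'year'
# group is left unchanged.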
def _clear_same_preserved_tag_on_same_line(self, token, level: Optional[T_Tag_Level] = None):
    preserved_tag_attrib_name = get_scoped_attrib_name(
        PRESERVED_TAG_ATTRIB_NAME, level=level
    )
    preserved_tag = strip_tag_prefix(token.attrib.get(preserved_tag_attrib_name))
    if not preserved_tag:
        return
    line_tokens = token.line.tokens
    get_logger().debug(
        'clearing tokens on same line: %s (%s)', preserved_tag, line_tokens
    )
    for line_token in line_tokens:
        if strip_tag_prefix(
                line_token.attrib.get(preserved_tag_attrib_name)) == preserved_tag:
            self._set_preserved_tag(line_token, None, level=level)
def get_suffix_extended_token_tags(
        token_tags: List[str],
        token_texts: List[str],
        enabled_tags: Set[str],
        token_whitespaces: Optional[List[str]] = None) -> List[Optional[str]]:
    result: List[Optional[str]] = []
    if token_whitespaces is None:
        token_whitespaces = [' '] * len(token_texts)
    grouped_token_tags: List[List[Tuple[str, str, Optional[str]]]] = [
        list(group)
        for _, group in groupby(
            zip(token_tags, token_texts, token_whitespaces),
            key=lambda pair: strip_tag_prefix(pair[0])
        )
    ]
    LOGGER.debug('suffix grouped_token_tags=%s', grouped_token_tags)
    for index, group in enumerate(grouped_token_tags):
        LOGGER.debug('suffix group: unpacked=%s', group)
        group_tags: List[str]
        group_texts: List[str]
        group_whitespaces: Optional[List[str]]
        group_tags, group_texts, group_whitespaces = zip(*group)  # type: ignore
        LOGGER.debug(
            'suffix group: tags=%s, texts=%s, whitespace=%s',
            group_tags, group_texts, group_whitespaces
        )
        first_group_tag = group_tags[0]
        prev_group = grouped_token_tags[index - 1] if index > 0 else None
        first_prev_tag: Optional[str] = get_safe(get_safe(prev_group, 0), 0)
        _, first_prev_tag_value = split_tag_prefix(first_prev_tag)
        # only consider untagged groups that follow an enabled tagged group
        if first_group_tag or first_prev_tag_value not in enabled_tags:
            result.extend(group_tags)
            continue
        joined_text = JoinedText(group_texts, sep=' ', whitespace_list=group_whitespaces)
        m = re.search(r'^\.', str(joined_text))
        LOGGER.debug('suffix match: %s', m)
        if not m:
            result.extend(group_tags)
            continue
        LOGGER.debug('suffix match end: %s (%r)', m.end(), str(joined_text)[:m.end()])
        matching_tokens = list(joined_text.iter_items_and_index_range_between(
            (0, m.end())
        ))
        LOGGER.debug('suffix matching_tokens: %s', matching_tokens)
        if not matching_tokens:
            result.extend(group_tags)
            continue
        # extend the previous tag over the matching suffix tokens, leave the rest untagged
        unmatched_token_count = len(group_tags) - len(matching_tokens)
        result.extend([to_inside_tag(first_prev_tag)] * len(matching_tokens))
        result.extend([None] * unmatched_token_count)
    LOGGER.debug('suffix result: %s', result)
    return result
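# Illustrative usage sketch for get_suffix_extended_token_tags (my example; the
# tag values are assumptions): an untagged group starting with '.' right after an
# enabled tagged group has that '.' pulled into the preceding tag, e.g.
#
#     get_suffix_extended_token_tags(
#         token_tags=['b-surname', None, None],
#         token_texts=['Smith', '.', 'Journal'],
#         enabled_tags={'surname'}
#     )
#
# tags the '.' token with the inside variant of the preceding tag and leaves
# 'Journal' untagged (None).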
def _get_tag_values_of_tokens(tokens, **kwargs):
    return [
        strip_tag_prefix(tag)
        for tag in _get_tags_of_tokens(tokens, **kwargs)
    ]
def get_prefix_extended_token_tags(
        token_tags: List[str],
        token_texts: List[str],
        prefix_regex_by_tag_map: Dict[str, str],
        token_whitespaces: Optional[List[str]] = None,
        enabled_tags: Optional[Set[str]] = None) -> List[Optional[str]]:
    result: List[Optional[str]] = []
    if token_whitespaces is None:
        token_whitespaces = [' '] * len(token_texts)
    _enabled_tags = (
        enabled_tags if enabled_tags is not None
        else prefix_regex_by_tag_map.keys()
    )
    grouped_token_tags: List[List[Tuple[Optional[str], str, Optional[str]]]] = [
        list(group)
        for _, group in groupby(
            zip(token_tags, token_texts, token_whitespaces),
            key=lambda pair: strip_tag_prefix(pair[0])
        )
    ]
    LOGGER.debug('grouped_token_tags=%s', grouped_token_tags)
    for index, group in enumerate(grouped_token_tags):
        LOGGER.debug('group: unpacked=%s', group)
        group_tags: List[str]
        group_texts: List[str]
        group_whitespaces: Optional[List[str]]
        group_tags, group_texts, group_whitespaces = zip(*group)  # type: ignore
        LOGGER.debug(
            'group: tags=%s, texts=%s, whitespace=%s',
            group_tags, group_texts, group_whitespaces
        )
        first_group_tag = group_tags[0]
        next_group = (
            grouped_token_tags[index + 1]
            if index + 1 < len(grouped_token_tags)
            else None
        )
        first_next_tag = get_safe(get_safe(next_group, 0), 0)
        first_next_prefix, first_next_tag_value = split_tag_prefix(first_next_tag)
        # only consider untagged groups followed by an enabled tagged group
        if first_group_tag or first_next_tag_value not in _enabled_tags:
            result.extend(group_tags)
            continue
        assert first_next_tag_value is not None
        joined_text = JoinedText(group_texts, sep=' ', whitespace_list=group_whitespaces)
        prefix_regex = prefix_regex_by_tag_map[first_next_tag_value]
        m = re.search(prefix_regex, str(joined_text))
        LOGGER.debug('m: %s', m)
        if not m:
            result.extend(group_tags)
            continue
        LOGGER.debug('start: %s (%r)', m.start(), str(joined_text)[m.start():])
        matching_tokens = list(joined_text.iter_items_and_index_range_between(
            (m.start(), len(str(joined_text)))
        ))
        LOGGER.debug('matching_tokens: %s', matching_tokens)
        if not matching_tokens:
            result.extend(group_tags)
            continue
        # tag the matching prefix tokens with the next group's tag
        unmatched_token_count = len(group_tags) - len(matching_tokens)
        result.extend([None] * unmatched_token_count)
        result.extend([first_next_tag])
        result.extend([to_inside_tag(first_next_tag)] * (len(matching_tokens) - 1))
        if first_next_prefix == B_TAG_PREFIX:
            # the span now begins in this group: demote the next group's begin tag
            assert next_group is not None
            next_group[0] = (
                to_inside_tag(first_next_tag),
                *next_group[0][1:]
            )
    LOGGER.debug('result: %s', result)
    return result
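# Illustrative usage sketch for get_prefix_extended_token_tags (my example; the tag
# name, regex and 'b-'/'i-' prefix values are assumptions): untagged tokens matching
# the prefix regex of the next tagged group are pulled into that group, e.g.
#
#     get_prefix_extended_token_tags(
#         token_tags=[None, None, 'b-page', 'i-page'],
#         token_texts=['see', 'pp.', '123', '124'],
#         prefix_regex_by_tag_map={'page': r'pp?\.\s*$'}
#     )
#
# leaves 'see' untagged, tags 'pp.' as the begin of 'page' and demotes the original
# begin tag of '123' to its inside variant, so the page span now starts at 'pp.'.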
def get_extended_line_token_tags(
        line_token_tags: Sequence[Optional[str]],
        extend_to_line_enabled_map: Optional[Dict[str, bool]] = None,
        merge_enabled_map: Optional[Dict[str, bool]] = None,
        default_extend_to_line_enabled: bool = DEFAULT_EXTEND_TO_LINE_ENABLED,
        default_merge_enabled: bool = DEFAULT_MERGE_ENABLED) -> List[Optional[str]]:
    if extend_to_line_enabled_map is None:
        extend_to_line_enabled_map = {}
    if merge_enabled_map is None:
        merge_enabled_map = {}
    LOGGER.debug(
        'line_token_tags: %s (extend_to_line_enabled_map: %s, merge_enabled_map: %s)',
        line_token_tags, extend_to_line_enabled_map, merge_enabled_map
    )
    grouped_token_tags: List[List[Optional[str]]] = [
        list(group)
        for _, group in groupby(line_token_tags, key=strip_tag_prefix)
    ]
    grouped_token_tags = [
        cast(
            List[Optional[str]],
            (
                get_merged_begin_inside_tags_of_same_tag_value(group)
                if merge_enabled_map.get(strip_tag_prefix(group[0]), default_merge_enabled)
                else group
            )
        )
        for group in grouped_token_tags
    ]
    LOGGER.debug('grouped_token_tags: %s', grouped_token_tags)
    result: List[Optional[str]] = []
    for index, group in enumerate(grouped_token_tags):
        prev_group = grouped_token_tags[index - 1] if index > 0 else None
        next_group = (
            grouped_token_tags[index + 1]
            if index + 1 < len(grouped_token_tags)
            else None
        )
        _, last_prev_tag_value = split_tag_prefix(get_safe(prev_group, -1))
        first_next_prefix, first_next_tag_value = split_tag_prefix(get_safe(next_group, 0))
        LOGGER.debug('group: %s', group)
        if group[0]:
            # already tagged: keep as-is
            result.extend(group)
        elif prev_group and next_group:
            if (
                last_prev_tag_value == first_next_tag_value
                and get_dict_safe(merge_enabled_map, last_prev_tag_value, default_merge_enabled)
            ):
                # untagged gap between two groups of the same tag: bridge it
                result.extend([to_inside_tag(prev_group[-1])] * len(group))
                if first_next_prefix == B_TAG_PREFIX:
                    next_group[0] = to_inside_tag(next_group[0])
            else:
                result.extend(group)
        elif (
            prev_group
            and not get_dict_safe(
                extend_to_line_enabled_map, last_prev_tag_value, default_extend_to_line_enabled
            )
        ):
            result.extend(group)
        elif (
            next_group
            and not get_dict_safe(
                extend_to_line_enabled_map, first_next_tag_value, default_extend_to_line_enabled
            )
        ):
            result.extend(group)
        elif prev_group and len(prev_group) > len(group):
            # extend the longer preceding tag over the untagged tail of the line
            result.extend([to_inside_tag(prev_group[-1])] * len(group))
        elif next_group and len(next_group) > len(group):
            # extend the longer following tag over the untagged head of the line
            result.extend(to_begin_inside_tags(next_group[0], len(group)))
            if first_next_prefix == B_TAG_PREFIX:
                next_group[0] = to_inside_tag(next_group[0])
        else:
            result.extend(group)
    LOGGER.debug('result: %s', result)
    return result
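# Illustrative usage sketch for get_extended_line_token_tags (my example; assumes
# the defaults leave merging and extend-to-line enabled and 'b-'/'i-' prefixes):
#
#     get_extended_line_token_tags(['b-title', 'i-title', 'i-title', None])
#
# The trailing untagged token sits on the same line as a longer 'title' group and
# is therefore extended with the inside variant of that tag.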
def get_tag_or_preserved_tag_value(self, *args, **kwargs):
    return strip_tag_prefix(self.get_tag_or_preserved_tag(*args, **kwargs))