def update_font_support_chars(self, chars_file):
    """
    Remove characters that a font lists as supported but renders as blank.

    Although some fonts have a specific character in the cmap, the rendered
    text is blank on the image. For every font in ``self.font_paths``, each
    character shared by the font's cached support set and the chars file is
    rendered at size 10; characters whose mask has no bounding box are
    removed from ``self.font_support_chars_cache[font_path]``. Afterwards
    ``self.font_support_chars_intersection_with_chars[font_path]`` is set to
    the characters that the (pruned) font support set shares with the chars
    file.

    Parameters
    ----------
    chars_file: Path
        one char per line

    """
    # Characters listed here are kept even when their mask is empty
    # (presumably because a space has no visible pixels -- confirm).
    white_list = [" "]
    charset = load_chars_file(chars_file)

    for font_path in self.font_paths:
        font = self._get_font(font_path, 10)
        supported = self.font_support_chars_cache[font_path]

        # `supported & charset` builds a new set, so removing from
        # `supported` while looping over the result is safe.
        removed_chars = []
        for c in supported & charset:
            bbox = font.getmask(c).getbbox()
            if bbox is None and c not in white_list:
                supported.remove(c)
                removed_chars.append(c)

        if removed_chars:
            # Only preview the first ten characters to keep the log readable.
            if len(removed_chars) > 10:
                preview = f"{removed_chars[:10]}..."
            else:
                preview = f"{removed_chars}"
            logger.info(
                f"Remove {len(removed_chars)} empty char mask from font [{font_path}]: {preview}"
            )

        # Intersect with the chars-file charset. Intersecting with a copy of
        # the font's own support set (as was done before) just reproduces the
        # pruned cache and ignores the chars file entirely.
        self.font_support_chars_intersection_with_chars[font_path] = (
            supported & charset
        )
def __init__(self, cfg: "CorpusCfg"):
    """
    Initialise the corpus and prepare the font manager for its charset.

    After the parent initialiser runs, the characters from
    ``self.cfg.chars_file`` are loaded into ``self.chars`` (as a list), the
    font manager's supported-character data is refreshed against the same
    file, and, when ``self.cfg.filter_font`` is truthy, fonts are filtered
    using ``self.cfg.filter_font_min_support_chars``.

    Args:
        cfg (CorpusCfg): configuration forwarded to the parent initialiser

    Raises:
        PanicError: if ``self.cfg.chars_file`` is None or does not exist
    """
    super().__init__(cfg)
    self.cfg: RandCorpusCfg

    chars_path = self.cfg.chars_file
    has_chars_file = chars_path is not None and chars_path.exists()
    if not has_chars_file:
        raise PanicError(f"chars_file not exists: {self.cfg.chars_file}")

    loaded_chars = load_chars_file(self.cfg.chars_file)
    self.chars = list(loaded_chars)

    fonts = self.font_manager
    fonts.update_font_support_chars(self.cfg.chars_file)
    if self.cfg.filter_font:
        min_support = self.cfg.filter_font_min_support_chars
        fonts.filter_font_path(min_support)
def _split_by_charset(text, chars):
    """Return (kept string, dropped items, item count) for one text against ``chars``."""
    kept = []
    dropped = []
    for c in text:
        if c in chars:
            kept.append(c)
        else:
            dropped.append(c)
    return "".join(kept), dropped, len(kept) + len(dropped)


def filter_by_chars(text, chars_file):
    """
    Filter chars not exist in chars file

    Args:
        text (Union[str, List[str]]): text to filter
        chars_file (Path): one char per line

    Returns:
        Union[str, List[str]]: string(s) removed chars not exist in chars file

    Raises:
        PanicError: if ``chars_file`` is None or does not exist
    """
    if chars_file is None or not chars_file.exists():
        raise PanicError(f"chars_file not exists: {chars_file}")

    chars = load_chars_file(chars_file, log=True)

    logger.info("filtering text by chars...")

    total_count = 0
    filtered_chars = []

    if isinstance(text, list):
        out = []
        for t in text:
            kept, dropped, count = _split_by_charset(t, chars)
            out.append(kept)
            filtered_chars.extend(dropped)
            total_count += count
    else:
        out, filtered_chars, total_count = _split_by_charset(text, chars)

    filtered_count = len(filtered_chars)
    # Empty input has nothing to divide by; report 0% instead of raising
    # ZeroDivisionError while formatting the summary.
    filtered_percent = (filtered_count / total_count) * 100 if total_count else 0.0
    unique_filtered = set(filtered_chars)
    logger.info(
        f"Filter {filtered_percent:.2f}%({filtered_count}) chars in input text。"
        f"Unique chars({len(unique_filtered)}): {unique_filtered}"
    )
    return out
def test_contain_two_space():
    """Loading ``two_space.txt`` must raise PanicError matching "Find two space"."""
    expect_panic = pytest.raises(PanicError, match="Find two space")
    with expect_panic:
        load_chars_file(DATA_DIR / "two_space.txt")
def test_contain_one_space(capsys):
    """Loading ``one_space.txt`` must yield a charset that contains SPACE_CHAR."""
    fixture_path = DATA_DIR / "one_space.txt"
    loaded = load_chars_file(fixture_path)
    assert SPACE_CHAR in loaded