示例#1
0
    def update_font_support_chars(self, chars_file):
        """
        Prune characters that a font lists but renders as an empty mask.

        Although some fonts have a specific character in the cmap, the rendered text is blank on the image.
        For every path in ``self.font_paths`` each character that is both in the
        font's cached support set and in the chars file is rendered at size 10;
        characters whose mask has no bounding box (except whitelisted ones such
        as the space) are removed in place from
        ``self.font_support_chars_cache[font_path]``. Afterwards
        ``self.font_support_chars_intersection_with_chars[font_path]`` is set to
        the pruned support set intersected with the chars-file charset.

        Parameters
        ----------
        chars_file: Path
            one char per line
        """
        # Characters that legitimately render as an empty mask.
        white_list = {" "}

        charset = load_chars_file(chars_file)
        for font_path in self.font_paths:
            font = self._get_font(font_path, 10)
            supported = self.font_support_chars_cache[font_path]

            removed_chars = []
            # `supported & charset` is a fresh set, so mutating `supported`
            # inside the loop does not disturb the iteration.
            for c in supported & charset:
                if c in white_list:
                    continue
                if font.getmask(c).getbbox() is None:
                    supported.remove(c)
                    removed_chars.append(c)

            if removed_chars:
                # Only preview the first ten removed characters to keep the log short.
                if len(removed_chars) > 10:
                    preview = f"{removed_chars[:10]}..."
                else:
                    preview = f"{removed_chars}"
                logger.info(
                    f"Remove {len(removed_chars)} empty char mask from font [{font_path}]: {preview}"
                )

            # Intersect with the chars-file charset (not with a copy of the
            # cache itself, which would just reproduce the cache).
            self.font_support_chars_intersection_with_chars[font_path] = (
                supported & charset)
示例#2
0
    def __init__(self, cfg: "CorpusCfg"):
        """
        Set up the corpus from ``cfg`` and prepare the font manager.

        After delegating to the parent initializer, the chars file named by
        ``self.cfg.chars_file`` is loaded into ``self.chars`` (as a list), the
        font manager's per-font support sets are refreshed against that file,
        and, when ``self.cfg.filter_font`` is truthy, fonts are filtered using
        ``self.cfg.filter_font_min_support_chars``.

        Raises:
            PanicError: if ``chars_file`` is ``None`` or does not exist.
        """
        super().__init__(cfg)

        # NOTE(review): self.cfg and self.font_manager are presumably set by the
        # parent initializer; this line only narrows the type for checkers.
        self.cfg: RandCorpusCfg

        chars_file = self.cfg.chars_file
        chars_file_available = chars_file is not None and chars_file.exists()
        if not chars_file_available:
            raise PanicError(f"chars_file not exists: {chars_file}")

        self.chars = list(load_chars_file(chars_file))

        font_manager = self.font_manager
        font_manager.update_font_support_chars(chars_file)
        if self.cfg.filter_font:
            font_manager.filter_font_path(self.cfg.filter_font_min_support_chars)
    def filter_by_chars(text, chars_file):
        """
        Filter chars not exist in chars file

        Every character of ``text`` that is not contained in the charset loaded
        from ``chars_file`` is dropped. A summary (share and count of dropped
        characters plus the set of distinct dropped characters) is logged.
        Empty input is handled: the reported share is 0.00% instead of raising
        ``ZeroDivisionError``.

        Args:
            text (Union[str, List[str]]): text to filter
            chars_file (Path): one char per line

        Returns:
            Union[str, List[str]]: string(s) removed chars not exist in chars file

        Raises:
            PanicError: if ``chars_file`` is ``None`` or does not exist.

        """
        if chars_file is None or not chars_file.exists():
            raise PanicError(f"chars_file not exists: {chars_file}")

        chars = load_chars_file(chars_file, log=True)

        logger.info("filtering text by chars...")

        total_count = 0
        filtered_chars = []

        def _keep_allowed(s):
            """Return ``s`` without chars missing from ``chars``; record the dropped ones."""
            nonlocal total_count
            kept = []
            for c in s:
                total_count += 1
                if c in chars:
                    kept.append(c)
                else:
                    filtered_chars.append(c)
            # Join once instead of repeated `+=` to avoid quadratic string building.
            return "".join(kept)

        if isinstance(text, list):
            out = [_keep_allowed(t) for t in text]
        else:
            out = _keep_allowed(text)

        filtered_count = len(filtered_chars)
        # Guard against empty input, where there is nothing to divide by.
        percent = (filtered_count / total_count) * 100 if total_count else 0.0
        unique_filtered = set(filtered_chars)
        logger.info(
            f"Filter {percent:.2f}%({filtered_count}) chars in input text。"
            f"Unique chars({len(unique_filtered)}): {unique_filtered}"
        )
        return out
def test_contain_two_space():
    """Loading ``two_space.txt`` must raise ``PanicError`` matching "Find two space"."""
    two_space_file = DATA_DIR / "two_space.txt"
    with pytest.raises(PanicError, match="Find two space"):
        load_chars_file(two_space_file)
def test_contain_one_space(capsys):
    """Loading ``one_space.txt`` must yield a charset that contains ``SPACE_CHAR``."""
    one_space_file = DATA_DIR / "one_space.txt"
    loaded_chars = load_chars_file(one_space_file)
    assert SPACE_CHAR in loaded_chars