예제 #1
0
 def core(s):
     r"""Collect the spans of LaTeX formulas found in ``s``.

     Two regex passes are made: one for ``$...$`` / ``$$...$$`` pairs whose
     delimiters are not directly preceded by a backslash, and one for
     ``$ \begin{array} ... \end{array} $`` blocks. When the enclosing
     scope's ``inner`` flag is truthy, the span of the named group
     ``inner`` is taken instead of the span of the whole match.

     :param s: text to scan
     :return: the sum of the two ``Intervals`` built from both passes
     :raises ValueError: if ``s`` contains two backslashes directly
         followed by ``$``
     """
     if r'\\$' in s:
         raise ValueError(r'内容中含有\\$,请先跑「refine_formula」加上空格')
     # Group whose span is reported: the named group, or 0 for the full match.
     group = 'inner' if inner else 0
     # Escapes only need to be considered for online content; offline text
     # may also contain a backslash pair followed by "$" that needs no
     # special handling.
     dollar_pattern = r'(?<!\\)(\$\$?)(?P<inner>.*?)(?<!\\)\1'
     array_pattern = r'\$\s*\\begin{array}\s*(?P<inner>.*?)\s*\\end{array}\s*\$'

     dollar_spans = []
     for hit in re.finditer(dollar_pattern, s, flags=re.DOTALL):
         dollar_spans.append(hit.span(group))

     array_spans = []
     for hit in re.finditer(array_pattern, s, flags=re.DOTALL):
         array_spans.append(hit.span(group))

     return Intervals(dollar_spans) + Intervals(array_spans)
예제 #2
0
    def nest(self, func, invert=False):
        """ Run one more level of locating inside every current interval

        :param func: a callable of the form func(s); it receives the text of
            one interval and returns an "intervals-like" object
        :param invert: whether to invert each per-interval result (relative
            to that interval's own text) before collecting it
        :return: a new object of the same type as self

        Locating features are meant to be built on this pattern: matching is
        not done against the whole of self.s, only inside the regions marked
        by self.intervals.
        """
        text = self.s
        collected = []
        for span in self.intervals:
            offset = span.start()
            piece = text[offset:span.end()]
            hits = Intervals(func(piece))
            if invert:
                hits = hits.invert(len(piece))
            # Positions are relative to `piece`; shift them back into `text`.
            collected.extend(hits + offset)
        return type(self)(text, Intervals(collected))
예제 #3
0
    def inside(self, head, tail=None):
        r""" 1. Locate what lies inside the head/tail markers

        Each current interval is searched separately with substr_intervals
        and the hits are shifted back to positions in self.s.

        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inside(r'\ce{').replace('x')
        '01x01x'
        """
        text = self.s
        found = []
        for span in self.intervals:
            offset = span.start()
            piece = text[offset:span.end()]
            found.extend(substr_intervals(piece, head, tail) + offset)
        return NestEnv(text, Intervals(found))
예제 #4
0
    def outside(self, head, tail=None):
        r""" 2. Locate what lies outside the head/tail markers

        Each current interval is searched separately with substr_intervals
        in invert mode and the hits are shifted back to positions in self.s.

        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').outside(r'\ce{').replace(lambda s: 'x')
        'x\\ce{H2O\\ce{2}}x\\ce{1\\ce{3}5}'
        """
        text = self.s
        found = []
        for span in self.intervals:
            offset = span.start()
            piece = text[offset:span.end()]
            complement = substr_intervals(piece, head, tail, invert=True)
            found.extend(complement + offset)
        return NestEnv(text, Intervals(found))
예제 #5
0
    def __sub__(self, other):
        """ Interval-set subtraction

        ``other`` may be an Intervals object, another NestEnv over the same
        text, or anything else that Intervals() accepts.

        >>> s = 'aa$b$ccc$dd$eee'
        >>> (NestEnv(s).find2('$', '$') - NestEnv(s).inside('a', 'd')).strings()
        ['d$']
        >>> (NestEnv(s).find2('$', '$') - re.finditer(r'a.*?d', s)).strings()
        ['d$']

        :raises ValueError: when ``other`` is a NestEnv built on different text
        """
        if isinstance(other, Intervals):
            subtrahend = other
        elif isinstance(other, NestEnv):
            if self.s != other.s:
                raise ValueError('两个NestEnv的主文本内容不相同,子区间集不能相减')
            subtrahend = other.intervals
        else:
            # Everything else is converted to an Intervals object first.
            subtrahend = Intervals(other)
        return NestEnv(self.s, self.intervals - subtrahend)
예제 #6
0
    def expand(self, ne):
        r""" Extend the current intervals with every interval of ``ne`` they touch

        Each interval of ``ne`` whose intersection with the current interval
        set is truthy is added to the current set.

        :param ne: a NestEnv (its intervals are used) or an Intervals object
        :raises TypeError: for any other argument type

        >>> ne = LatexNestEnv(r'aa$cc\ce{a}dd$bb\ce{d}h$h$')
        >>> ne.latexcmd1(r'ce').expand(ne.formula()).strings()
        ['$cc\\ce{a}dd$', '\\ce{d}']

        TODO also allow merely adjacent intervals to trigger the extension?
        """
        if isinstance(ne, NestEnv):
            candidates = ne.intervals
        elif isinstance(ne, Intervals):
            candidates = ne
        else:
            raise TypeError
        current = self.intervals
        touching = []
        for itv in candidates:
            if self.intervals & itv:
                touching.append(itv)
        return NestEnv(self.s, current + Intervals(touching))
예제 #7
0
    def __and__(self, other):
        r""" Interval-set intersection

        ``other`` may be an Intervals object, another NestEnv over the same
        text, or anything else that Intervals() accepts.

        >>> s = 'aa$b$ccc$dd$eee'
        >>> (NestEnv(s).find2('$', '$') & NestEnv(s).inside('a', 'd')).strings()
        ['$b$', '$d']
        >>> (NestEnv(s).find2('$', '$') & re.finditer(r'a.*?d', s)).strings()
        ['$b$', '$d']

        :raises ValueError: when ``other`` is a NestEnv built on different text
        """
        if isinstance(other, Intervals):
            operand = other
        elif isinstance(other, NestEnv):
            # Interval sets over different texts cannot be combined.
            if self.s != other.s:
                raise ValueError('两个NestEnv的主文本内容不相同,不能求子区间集的交')
            operand = other.intervals
        else:
            # Everything else is converted to an Intervals object first
            # (rather than being rejected with a TypeError).
            operand = Intervals(other)
        return NestEnv(self.s, self.intervals & operand)
예제 #8
0
    def inner(self, head, tail=None):
        r""" 0. Locate what lies inside the markers, excluding head and tail themselves

        Each current interval is searched separately with substr_intervals
        in inner mode and the hits are shifted back to positions in self.s.

        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inner(r'\ce{').inner(r'\ce{').replace('x')
        '01\\ce{H2O\\ce{x}}01\\ce{1\\ce{x}5}'
        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inner(r'\cc{').string()
        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').outside(r'\cc{').string()
        '01\\ce{H2O\\ce{2}}01\\ce{1\\ce{3}5}'

        TODO locators such as topic / analysis are not supported by this
            function yet and will misbehave
        TODO using 0 as the mode number is awkward for combining features:
            1 and 2 are mutually exclusive, but 0 and 2 are not and could be
            combined (content outside the range, tags included); 4 would be
            a better number, but changing it now is awkward too, so this is
            only recorded for later
        """
        text = self.s
        found = []
        for span in self.intervals:
            offset = span.start()
            piece = text[offset:span.end()]
            bare = substr_intervals(piece, head, tail, inner=True)
            found.extend(bare + offset)
        return NestEnv(text, Intervals(found))
예제 #9
0
 def __init__(self, s, intervals=None):
     """Store the text ``s`` and its interval set.

     When ``intervals`` is None a single interval covering the whole of
     ``s`` is used; whatever is chosen is passed through ``Intervals()``.
     """
     self.s = s
     if intervals is None:
         # Default: one interval spanning the entire text.
         covering = Intervals([[0, len(s)]])
     else:
         covering = intervals
     self.intervals = Intervals(covering)
예제 #10
0
class __NestEnvBase:
    """Base class holding a text ``s`` together with an interval set over it.

    Most methods return a new NestEnv built from the same text and a derived
    interval set; ``nest`` returns an object of the caller's own type.
    """
    __slots__ = ('s', 'intervals')

    def __init__(self, s, intervals=None):
        """Store the text and its intervals; None means one interval covering all of ``s``."""
        self.s = s
        if intervals is None: intervals = Intervals([[0, len(s)]])
        self.intervals = Intervals(intervals)

    def inner(self, head, tail=None):
        r""" 0. Locate what lies inside the markers, excluding head and tail themselves

        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inner(r'\ce{').inner(r'\ce{').replace('x')
        '01\\ce{H2O\\ce{x}}01\\ce{1\\ce{x}5}'
        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inner(r'\cc{').string()
        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').outside(r'\cc{').string()
        '01\\ce{H2O\\ce{2}}01\\ce{1\\ce{3}5}'

        TODO locators such as topic / analysis are not supported by this
            function yet and will misbehave
        TODO using 0 as the mode number is awkward for combining features:
            1 and 2 are mutually exclusive, but 0 and 2 are not and could be
            combined (content outside the range, tags included); 4 would be
            a better number, but changing it now is awkward too, so this is
            only recorded for later
        """
        li = []
        for reg in self.intervals:
            left, right = reg.start(), reg.end()
            # Search only this interval's text, then shift hits back by `left`.
            li.extend(
                substr_intervals(self.s[left:right], head, tail, inner=True) +
                left)
        return NestEnv(self.s, Intervals(li))

    def inside(self, head, tail=None):
        r""" 1. Locate what lies inside the head/tail markers

        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inside(r'\ce{').replace('x')
        '01x01x'
        """
        li = []
        for reg in self.intervals:
            left, right = reg.start(), reg.end()
            li.extend(substr_intervals(self.s[left:right], head, tail) + left)
        return NestEnv(self.s, Intervals(li))

    def outside(self, head, tail=None):
        r""" 2. Locate what lies outside the head/tail markers

        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').outside(r'\ce{').replace(lambda s: 'x')
        'x\\ce{H2O\\ce{2}}x\\ce{1\\ce{3}5}'
        """
        li = []
        for reg in self.intervals:
            left, right = reg.start(), reg.end()
            li.extend(
                substr_intervals(self.s[left:right], head, tail, invert=True) +
                left)
        return NestEnv(self.s, Intervals(li))

    def expand(self, ne):
        r""" Extend the current intervals with every interval of ``ne`` they touch

        The original note says head/tail pairing rules may also be passed,
        but the code below only accepts a NestEnv or an Intervals object and
        raises TypeError for anything else.

        >>> ne = LatexNestEnv(r'aa$cc\ce{a}dd$bb\ce{d}h$h$')
        >>> ne.latexcmd1(r'ce').expand(ne.formula()).strings()
        ['$cc\\ce{a}dd$', '\\ce{d}']

        TODO also allow merely adjacent intervals to trigger the extension?
        """
        if isinstance(ne, NestEnv):
            b = ne.intervals
        elif isinstance(ne, Intervals):
            b = ne
        else:
            raise TypeError
        # Keep the intervals of b whose intersection with ours is truthy.
        c = self.intervals + Intervals([x for x in b if (self.intervals & x)])
        return NestEnv(self.s, c)

    def filter(self, func):
        r""" Keep only the intervals whose text makes the custom function func(s) truthy

        >>> LatexNestEnv('aa$bbb$ccc$d$eee$fff$g').formula().filter(lambda s: len(s) > 4).strings()
        ['$bbb$', '$fff$']
        """
        li = list(
            filter(lambda x: func(self.s[x.start():x.end()]), self.intervals))
        return NestEnv(self.s, li)

    def _parse_tags(self, tags):
        """Normalise ``tags`` to a list of 3-item groups; a flat sequence is cut into triples."""
        if not isinstance(tags[0], (list, tuple)):
            # Legacy flat one-dimensional input: convert it to a 2-D structure first
            n = len(tags) // 3
            assert n and n * 3 == len(tags)
            tags = [tags[3 * i:3 * (i + 1)] for i in range(n)]
        return tags

    # def any(self, tags):
    #     r""" Union of interval sets
    #
    #     :param tags: same rules as the tags parameter of nestenv
    #
    #     >>> NestEnv(r'12$34$56\ce{78}90').any(['$', '$', 1, r'\ce{', '}', 1]).replace(lambda s: 'x')
    #     '12x56x90'
    #     """
    #     tags, li = self._parse_tags(tags), []
    #     for tag in tags:
    #         head, tail, t = tag
    #         for reg in self.intervals:
    #             left, right = reg.start(), reg.end()
    #             li.extend(substr_intervals(self.s[left:right], head, tail, invert=(t == 2), inner=(t == 0)) + left)
    #     return NestEnv(self.s, Intervals(li))

    # def all(self, tags):
    #     r""" Intersection of interval sets
    #
    #     :param tags: same rules as the tags parameter of nestenv
    #
    #     # Delete content that is both inside a formula and inside ce
    #     >>> NestEnv(r'12$34$56\ce{78$x$}90').all([r'\ce{', '}', 1, '$', '$', 1]).replace(lambda s: '')
    #     '12$34$56\\ce{78}90'
    #
    #     >>> NestEnv(r'12$34$56\ce{78$x$}90').all(['$', '$', 1, r'\ce{', '}', 1]).replace(lambda s: '')
    #     '12$34$56\\ce{78}90'
    #     """
    #     tags, intervals = self._parse_tags(tags), self.intervals
    #     for tag in tags:
    #         head, tail, t = tag
    #         li = []
    #         for reg in self.intervals:
    #             left, right = reg.start(), reg.end()
    #             li.extend(substr_intervals(self.s[left:right], head, tail, invert=(t == 2), inner=(t == 0)) + left)
    #         intervals &= Intervals(li)
    #     return NestEnv(self.s, intervals)

    def __repr__(self):
        """Non-newline characters that are not within the located ranges are all replaced by spaces"""
        t = self.intervals.replace(self.s,
                                   lambda s: s,
                                   out_repl=lambda s: re.sub(r'[^\n]', ' ', s))
        return t

    def __bool__(self):
        """The truth value of a NestEnv is that of its interval set"""
        return bool(self.intervals)

    def string(self, idx=0):
        """Text of the interval at position idx (the first one by default), or None if unavailable

        >>> NestEnv('11a22b33a44bcc').find2('a', 'b').string()
        'a22b'
        """
        if self.intervals and idx < len(self.intervals):
            r = self.intervals.li[idx]
            return self.s[r.start():r.end()]
        else:
            return None

    def strings(self):
        """Text of every interval, as a list"""
        if self.intervals:
            return [self.s[r.start():r.end()] for r in self.intervals]
        else:
            return []

    def startlines(self, unique=False):
        r""" Line number (1-based) in the original text s at which each located interval starts

        :param unique: when true, return the sorted distinct line numbers

        >>> NestEnv('{}\naa\n{}\n{}{}a\nb').inside('{', '}').startlines()
        [1, 3, 4, 4]
        """
        if not self.intervals: return []
        # 1 Helper data: offset of every newline, plus the text length as a final entry
        linepos = [m.start() for m in re.finditer(r'\n', self.s)]
        n = len(self.s)
        if n and (not linepos or linepos[-1] != n): linepos.append(n)
        # 2 Starting line number of each sub-interval
        lines = [
            bisect.bisect_right(linepos,
                                x.start() - 1) + 1 for x in self.intervals
        ]
        if unique: lines = sorted(set(lines))
        return lines

    def group(self, idx=0):
        """The interval at position idx (the first one by default) as a match-style object, or None"""
        if self.intervals and idx < len(self.intervals):
            r = self.intervals.li[idx]
            return ReMatch(r.regs, self.s, 0, len(self.s))
        else:
            return None

    def groups(self):
        """All located intervals, returned as match-style objects"""
        if self.intervals:
            return [
                ReMatch(r.regs, self.s, 0, len(self.s)) for r in self.intervals
            ]
        else:
            return []

    # TODO def gettag, settag, gettags, settags: special inside operations
    # TODO def getattr, setattr, getattrs, setattrs

    def sub(self,
            infunc=lambda m: m.group(),
            *,
            outfunc=lambda m: m.group(),
            adjacent=False) -> str:
        """Replacement in the style of re.sub; delegates to self.intervals.sub"""
        return self.intervals.sub(self.s,
                                  infunc,
                                  out_repl=outfunc,
                                  adjacent=adjacent)

    def replace(self,
                arg1,
                arg2=None,
                *,
                outfunc=lambda s: s,
                adjacent=False) -> str:
        """ Replacement in the style of str.replace; delegates to self.intervals.replace

        arg1 may be a custom replacement function, or arguments may be passed
        as in str.replace(arg1, arg2)
        """
        return self.intervals.replace(self.s,
                                      arg1,
                                      arg2,
                                      out_repl=outfunc,
                                      adjacent=adjacent)

    def __invert__(self):
        r""" Complement of the interval set within the full length of the text

        >>> (~NestEnv('aa$b$cc').find2('$', '$')).strings()
        ['aa', 'cc']
        """
        return NestEnv(self.s, self.intervals.invert(len(self.s)))

    def invert(self):
        r""" Method form of the ``~`` operator

        >>> NestEnv('aa$b$cc').find2('$', '$').invert().strings()
        ['aa', 'cc']
        """
        return ~self

    def __and__(self, other):
        r""" Interval-set intersection

        >>> s = 'aa$b$ccc$dd$eee'
        >>> (NestEnv(s).find2('$', '$') & NestEnv(s).inside('a', 'd')).strings()
        ['$b$', '$d']
        >>> (NestEnv(s).find2('$', '$') & re.finditer(r'a.*?d', s)).strings()
        ['$b$', '$d']
        """
        if isinstance(other, Intervals):
            return NestEnv(self.s, self.intervals & other)
        elif isinstance(other, NestEnv):
            if self.s != other.s:  # interval sets over different texts cannot be combined
                raise ValueError('两个NestEnv的主文本内容不相同,不能求子区间集的交')
            return NestEnv(self.s, self.intervals & other.intervals)
        else:  # everything else is converted to an Intervals object first
            # (an earlier version raised TypeError for unsupported operand types here)
            return NestEnv(self.s, self.intervals & Intervals(other))

    def __or__(self, other):
        """ Interval-set union

        >>> s = 'aa$b$ccc$dd$eee'
        >>> (NestEnv(s).find2('$', '$') | NestEnv(s).inside('a', 'd')).strings()
        ['aa$b$ccc$dd$']
        >>> (NestEnv(s).find2('$', '$') | re.finditer(r'a.*?d', s)).strings()
        ['aa$b$ccc$dd$']
        """
        if isinstance(other, Intervals):
            return NestEnv(self.s, self.intervals | other)
        elif isinstance(other, NestEnv):
            if self.s != other.s:
                raise ValueError('两个NestEnv的主文本内容不相同,不能求子区间集的并')
            return NestEnv(self.s, self.intervals | other.intervals)
        else:  # everything else is converted to an Intervals object first
            return NestEnv(self.s, self.intervals | Intervals(other))

    def __add__(self, other):
        """``+`` is an alias of ``|``"""
        return self | other

    def __sub__(self, other):
        """ Interval-set subtraction

        >>> s = 'aa$b$ccc$dd$eee'
        >>> (NestEnv(s).find2('$', '$') - NestEnv(s).inside('a', 'd')).strings()
        ['d$']
        >>> (NestEnv(s).find2('$', '$') - re.finditer(r'a.*?d', s)).strings()
        ['d$']
        """
        if isinstance(other, Intervals):
            return NestEnv(self.s, self.intervals - other)
        elif isinstance(other, NestEnv):
            if self.s != other.s:
                raise ValueError('两个NestEnv的主文本内容不相同,子区间集不能相减')
            return NestEnv(self.s, self.intervals - other.intervals)
        else:  # everything else is converted to an Intervals object first
            return NestEnv(self.s, self.intervals - Intervals(other))

    def nest(self, func, invert=False):
        """ Run one more level of nested locating inside every sub-interval

        :param func: a function of the form func(s);
            it receives a string and returns an "intervals-like" object
        :param invert: whether to invert the result once more
            (done per interval, relative to that interval's text)
        :return: a new object of the same type as self

        Note that essentially all locating features should be built on this
        pattern, because matching is not done against the whole of self.s:
        processing is nested and only covers the ranges marked by
        self.intervals.
        """
        li = []
        for reg in self.intervals:
            left, right = reg.start(), reg.end()
            t = self.s[left:right]
            res = Intervals(func(t))
            if invert: res = res.invert(len(t))
            # Positions in res are relative to t; shift them back into self.s.
            li.extend(res + left)
        return type(self)(self.s, Intervals(li))
예제 #11
0
def substr_intervals(s, head, tail=None, invert=False, inner=False):
    r""" Legacy module, not recommended; prefer the newer NestEnv interface

    Picks a locating strategy from the shape of ``head`` / ``tail`` and
    returns the matching spans of ``s`` as an Intervals object.

    :param s: the text to search
    :param head: opening marker
        TODO regex and non-regex heads could be split into sub-functions
        instead of all living in this one
    :param tail: closing marker; when omitted it is inferred from ``head``
        where possible
        TODO support mixing plain strings and regex objects for head/tail
    :param invert: whether to return the complement within ``s``
    :param inner:  TODO note that many strategies do not support inner mode yet
        False: located ranges include the markers
        True: located ranges exclude the markers
    :return: an Intervals object

    TODO handle a tabular nested inside a tabular correctly?
    TODO support locating topic and sub_topic at the same time?
    """
    openers, closers = '[{(<', ']})>'

    def infer_headtail(head, tail=None):
        """Given abbreviated head/tail input, return the inferred full (head, tail) pair"""
        if not (isinstance(head, str) and tail is None):
            return head, tail
        if re.match(r'\$+$', head):  # formula delimiter: the tail equals the head
            return head, head
        if re.match(r'\\(chapter|section|subsection){', head):
            # The tail stays None here, standing for an open-ended closing marker
            return head, None
        if head[-1] in '[{(<':  # paired brackets
            return head, {'[': ']', '{': '}', '(': ')', '<': '>'}[head[-1]]
        if head.startswith('%<'):
            return head, '%/'
        if head[0] == '<':
            return head, 'xmltag'
        env = re.match(r'\\begin({[a-zA-Z]+})', head)  # latex-style environment
        if env:
            return head, r'\end' + env.group(1)
        return head, None  # nothing could be inferred

    head, tail = infer_headtail(head, tail)

    def locate():
        """Return the raw located parts for the first strategy that applies"""
        # 1 Bracket matching: the last char of head pairs with the first char of tail
        #   TODO the pairing bracket of tail could also sit in the middle of the content
        if head[-1] in openers and tail and len(tail) and tail[0] == closers[
                openers.index(head[-1])]:
            return NestEnv(s).bracket(head, tail, inner).intervals
        # 2 Second bracket form: the first char of head pairs with the last char of tail
        if head[0] in openers and tail and len(tail) and tail[-1] == closers[
                openers.index(head[0])]:
            return NestEnv(s).bracket2(head, tail, inner).intervals
        # 3 Formula matching
        if head == tail == '$':
            return LatexNestEnv(s).formula(inner).intervals
        # 4 Percent-comment structure: %<xxx a='yy'> ... %</xxx>
        if re.match(r'%<[a-zA-Z\-_]+', head) and tail == '%/':
            # NOTE(review): the literal 'inner' is passed, not the `inner` flag - confirm intent
            return LatexNestEnv(s).pxmltag(head[2:], 'inner').intervals
        # 5 latex chapter / section / subsection  TODO support inner
        if re.match(r'\\(chapter|section|subsection)', head) and not tail:
            # NOTE(review): unlike the other branches, no `.intervals` is taken here - confirm
            return LatexNestEnv(s).latexpart(head[1:], inner=inner)
        if head == r'\item':
            return LatexNestEnv(s).item().intervals
        # 7 latex-style environment matching
        begin = re.match(r'\\begin{([a-zA-Z]+)}', head)
        if begin:
            end = re.match(r'\\end{([a-zA-Z]+)}', tail)
            if end and begin.group(1) == end.group(1):
                return LatexNestEnv(s).latexenv(head, tail, inner).intervals
            return LatexNestEnv(s).find2(head, tail, inner).intervals
        # 8 All picture-inclusion commands in latex
        if head == r'\includegraphics' and tail is None:
            # NOTE(review): the literal 'inner' is passed, not the `inner` flag - confirm intent
            return LatexNestEnv(s).includegraphics('inner').intervals
        # 9 lewis electron-formula matching
        if head == r'\lewis' and tail is None:
            return LatexNestEnv(s).lewis(inner=inner).intervals
        # 10 xml tag node matching
        if head[0] == '<' and tail == 'xmltag':
            return NestEnv(s).xmltag(head[1:], inner).intervals
        # Fallback: plain matching
        if isinstance(head, str) and isinstance(tail, str):
            return NestEnv(s).find2(head, tail, inner).intervals
        if isinstance(head, str) and not isinstance(tail, str):
            return NestEnv(s).find(head).intervals
        return []

    located = Intervals(locate())
    if invert:
        located = located.invert(len(s))
    return located
예제 #12
0
파일: geo.py 프로젝트: XLPRUtils/pyxllib
def split_vector_interval(vec, maxsplit=None, minwidth=3):
    """ Cut a 1-D vector into foreground segments separated by background runs

    :param vec: a one-dimensional vector to be cut
        The values must be prepared beforehand so that
            background is non-positive; the more likely a position is
            background, the larger the magnitude of its negative value
            foreground is positive; the more likely foreground, the larger
        The aim is to obtain the regions of highest energy (largest values,
        foreground content), while tolerating some noise.

        Typically
            0 stands for background
            a positive value < 1 is the share of black pixels in that column (np.mean)
            integers from np.sum also work for now, but ratios extend better
            a negative value marks special background, which offsets part of
            the minwidth requirement
    :param maxsplit: maximum number of resulting sub-intervals;
        when not set, every qualifying position is cut
    :param minwidth: minimum width a cut position must have
    :return: [[l, r], [l, r], ...]  left/right bounds of each segment
    """
    # 1 Trim non-positive values from the left and the right
    total = len(vec)
    left = 0
    while left < total and vec[left] <= 0:
        left += 1
    right = total
    while right > left and vec[right - 1] <= 0:
        right -= 1
    # (A disabled variant only trimmed margins at least minwidth wide.)

    vec = vec[left:right]
    width = len(vec)
    if not width:
        return []  # nothing but background: return an empty list

    # 2 Find cut positions: for every run of consecutive background, take its
    #   length and the sum of its values (used as the background confidence)
    candidates = []

    def flush(start, length):
        """Record the run [start, start + length) if it is wide enough"""
        score = vec[start:start + length].sum()
        if length >= (minwidth + score):  # negative values relax the minwidth limit
            candidates.append([[start, start + length], score])

    run_start, run_len = 0, 0
    for pos in range(width):
        if vec[pos] <= 0:
            if run_len == 0:
                run_start = pos
            run_len += 1
            continue
        # Foreground reached: close whatever run is pending.
        flush(run_start, run_len)
        run_len = 0
    flush(run_start, run_len)

    # 3 Keep the cut positions with the strongest confidence (lowest sums first)
    if maxsplit:
        candidates.sort(key=lambda item: item[1])
        candidates = candidates[:(maxsplit - 1)]
    candidates.sort(key=lambda item: item[0])  # back to left-to-right order

    # 4 Return the text segments (complement of the cuts, shifted by `left`)
    cuts = [span for span, _ in candidates]
    segments = Intervals(cuts).invert(width) + left
    return [[seg.start(), seg.end()] for seg in segments]