def core(s):
    r"""Collect the spans of dollar-delimited formulas found in ``s``.

    Two patterns are searched with ``re.DOTALL``:

    * ``$...$`` / ``$$...$$`` whose delimiters are not preceded by a backslash;
    * ``$ \begin{array} ... \end{array} $`` blocks.

    ``inner`` is a free variable that is not defined in this function
    (presumably supplied by the enclosing scope -- TODO confirm). When it is
    truthy the span of the named group ``inner`` is used, otherwise the span
    of the whole match (group 0).

    :param s: text to scan
    :return: ``Intervals`` of the first pattern's spans ``+`` ``Intervals``
        of the second pattern's spans
    :raises ValueError: if ``s`` contains two backslashes directly followed
        by ``$``
    """
    if s.find(r'\\$') != -1:
        raise ValueError(r'内容中含有\\$,请先跑「refine_formula」加上空格')

    group = 0
    if inner:
        group = 'inner'

    # Original author's note: escaping only has to be considered for online
    # data; offline, a `\\` followed by `$` may not need any handling.
    patterns = (
        r'(?<!\\)(\$\$?)(?P<inner>.*?)(?<!\\)\1',
        r'\$\s*\\begin{array}\s*(?P<inner>.*?)\s*\\end{array}\s*\$',
    )
    dollar_spans, array_spans = (
        [m.span(group) for m in re.finditer(pattern, s, flags=re.DOTALL)]
        for pattern in patterns
    )
    return Intervals(dollar_spans) + Intervals(array_spans)
def nest(self, func, invert=False):
    """ Run one level of nested locating inside every current sub-interval.

    :param func: a callable used as ``func(s)``; it receives the text of one
        sub-interval and returns an "Intervals-like" object (it is passed to
        ``Intervals(...)``) whose positions are relative to that text
    :param invert: if true, the result found inside each sub-interval is
        inverted with respect to the length of that sub-interval
    :return: a new object of the same class as ``self`` holding the located
        intervals, shifted back to positions in ``self.s``

    Essentially every locating feature is meant to be built on this pattern:
    matching is not done against the whole of ``self.s`` but only within the
    regions currently marked by ``self.intervals``.
    """
    located = []
    for region in self.intervals:
        offset, stop = region.start(), region.end()
        piece = self.s[offset:stop]
        found = Intervals(func(piece))
        if invert:
            found = found.invert(len(piece))
        # Shift from piece-relative to absolute positions before collecting.
        located.extend(found + offset)
    return type(self)(self.s, Intervals(located))
def inside(self, head, tail=None):
    r""" 1. Match what lies inside the markers (markers included).

    Each current sub-interval is searched with ``substr_intervals`` and the
    hits are shifted back to absolute positions.

    >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inside(r'\ce{').replace('x')
    '01x01x'

    :param head: opening marker, forwarded to ``substr_intervals``
    :param tail: closing marker, forwarded to ``substr_intervals``
    :return: a new ``NestEnv`` over the same text
    """
    located = []
    for region in self.intervals:
        begin, end = region.start(), region.end()
        hits = substr_intervals(self.s[begin:end], head, tail)
        located.extend(hits + begin)
    return NestEnv(self.s, Intervals(located))
def outside(self, head, tail=None):
    r""" 2. Match what lies outside the markers.

    Same as locating the marked spans inside each current sub-interval, but
    ``substr_intervals`` is asked to invert its result.

    >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').outside(r'\ce{').replace(lambda s: 'x')
    'x\\ce{H2O\\ce{2}}x\\ce{1\\ce{3}5}'

    :param head: opening marker, forwarded to ``substr_intervals``
    :param tail: closing marker, forwarded to ``substr_intervals``
    :return: a new ``NestEnv`` over the same text
    """
    located = []
    for region in self.intervals:
        begin, end = region.start(), region.end()
        complement = substr_intervals(self.s[begin:end], head, tail,
                                      invert=True)
        located.extend(complement + begin)
    return NestEnv(self.s, Intervals(located))
def __sub__(self, other):
    """ Interval-set subtraction.

    >>> s = 'aa$b$ccc$dd$eee'
    >>> (NestEnv(s).find2('$', '$') - NestEnv(s).inside('a', 'd')).strings()
    ['d$']
    >>> (NestEnv(s).find2('$', '$') - re.finditer(r'a.*?d', s)).strings()
    ['d$']

    :param other: an ``Intervals``, a ``NestEnv`` over the same text, or any
        other object accepted by ``Intervals(...)``
    :return: a new ``NestEnv`` over the same text
    :raises ValueError: if ``other`` is a ``NestEnv`` over a different text
    """
    if isinstance(other, Intervals):
        subtrahend = other
    elif isinstance(other, NestEnv):
        if self.s != other.s:
            raise ValueError('两个NestEnv的主文本内容不相同,子区间集不能相减')
        subtrahend = other.intervals
    else:
        # Everything else is converted to an Intervals object first.
        subtrahend = Intervals(other)
    return NestEnv(self.s, self.intervals - subtrahend)
def expand(self, ne):
    r""" Extend the current intervals with every interval of ``ne`` that
    intersects them.

    Each interval of ``ne`` whose intersection with ``self.intervals`` is
    non-empty is added to the current interval set; intervals of ``ne`` that
    do not touch the current set are ignored.

    >>> ne = LatexNestEnv(r'aa$cc\ce{a}dd$bb\ce{d}h$h$')
    >>> ne.latexcmd1(r'ce').expand(ne.formula()).strings()
    ['$cc\\ce{a}dd$', '\\ce{d}']

    TODO: also allow merely adjacent intervals to trigger the expansion?

    :param ne: a ``NestEnv`` or an ``Intervals``; head/tail pairing rules are
        not accepted by this implementation
    :return: a new ``NestEnv`` over the same text
    :raises TypeError: if ``ne`` is neither a ``NestEnv`` nor an ``Intervals``
    """
    if isinstance(ne, NestEnv):
        candidates = ne.intervals
    elif isinstance(ne, Intervals):
        candidates = ne
    else:
        raise TypeError(
            f'expand() expects a NestEnv or Intervals, got {type(ne).__name__}')

    # Keep only the candidates that overlap something we already cover.
    touching = [x for x in candidates if (self.intervals & x)]
    return NestEnv(self.s, self.intervals + Intervals(touching))
def __and__(self, other):
    r""" Interval-set intersection.

    >>> s = 'aa$b$ccc$dd$eee'
    >>> (NestEnv(s).find2('$', '$') & NestEnv(s).inside('a', 'd')).strings()
    ['$b$', '$d']
    >>> (NestEnv(s).find2('$', '$') & re.finditer(r'a.*?d', s)).strings()
    ['$b$', '$d']

    :param other: an ``Intervals``, a ``NestEnv`` over the same text, or any
        other object accepted by ``Intervals(...)``
    :return: a new ``NestEnv`` over the same text
    :raises ValueError: if ``other`` is a ``NestEnv`` over a different text
    """
    if isinstance(other, Intervals):
        rhs = other
    elif isinstance(other, NestEnv):
        if self.s != other.s:
            # Interval sets over different texts cannot be combined.
            raise ValueError('两个NestEnv的主文本内容不相同,不能求子区间集的交')
        rhs = other.intervals
    else:
        # Everything else is converted to an Intervals object rather than
        # being rejected with a TypeError.
        rhs = Intervals(other)
    return NestEnv(self.s, self.intervals & rhs)
def inner(self, head, tail=None):
    r""" 0. Match what lies inside the markers, excluding the head and tail
    markers themselves.

    >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inner(r'\ce{').inner(r'\ce{').replace('x')
    '01\\ce{H2O\\ce{x}}01\\ce{1\\ce{x}5}'

    >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inner(r'\cc{').string()
    >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').outside(r'\cc{').string()
    '01\\ce{H2O\\ce{2}}01\\ce{1\\ce{3}5}'

    TODO: locating things such as topic / analysis is not supported by this
        function yet and is buggy.
    TODO: numbering this mode 0 is not ideal for combining modes: 1 and 2 are
        mutually exclusive, but 0 and 2 are not and could be combined (the
        outside content together with the tags). 4 would be a better number,
        but changing it now is awkward, so it is only recorded here.

    :param head: opening marker, forwarded to ``substr_intervals``
    :param tail: closing marker, forwarded to ``substr_intervals``
    :return: a new ``NestEnv`` over the same text
    """
    located = []
    for region in self.intervals:
        begin, end = region.start(), region.end()
        hits = substr_intervals(self.s[begin:end], head, tail, inner=True)
        located.extend(hits + begin)
    return NestEnv(self.s, Intervals(located))
def __init__(self, s, intervals=None):
    """ Store the text and the interval set that marks the active regions.

    :param s: the main text
    :param intervals: anything accepted by ``Intervals(...)``; when omitted,
        a single interval covering the whole of ``s`` is used
    """
    self.s = s
    whole_text = Intervals([[0, len(s)]]) if intervals is None else intervals
    self.intervals = Intervals(whole_text)
class __NestEnvBase:
    """ A text ``s`` paired with an ``Intervals`` set marking the regions of
    the text that are currently selected.

    Locating methods return a new object over the same text with a narrowed
    (or otherwise transformed) interval set; extraction and replacement
    methods read from / rewrite the text according to that set.
    """
    __slots__ = ('s', 'intervals')

    def __init__(self, s, intervals=None):
        """ Store the text and the interval set.

        :param s: the main text
        :param intervals: anything accepted by ``Intervals(...)``; when
            omitted, a single interval covering the whole of ``s`` is used
        """
        self.s = s
        if intervals is None:
            intervals = Intervals([[0, len(s)]])
        self.intervals = Intervals(intervals)

    def inner(self, head, tail=None):
        r""" 0. Match what lies inside the markers, excluding the head and
        tail markers themselves.

        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inner(r'\ce{').inner(r'\ce{').replace('x')
        '01\\ce{H2O\\ce{x}}01\\ce{1\\ce{x}5}'

        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inner(r'\cc{').string()
        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').outside(r'\cc{').string()
        '01\\ce{H2O\\ce{2}}01\\ce{1\\ce{3}5}'

        TODO: locating things such as topic / analysis is not supported by
            this function yet and is buggy.
        TODO: numbering this mode 0 is not ideal for combining modes: 1 and 2
            are mutually exclusive, but 0 and 2 are not and could be combined
            (the outside content together with the tags). 4 would be a better
            number, but changing it now is awkward, so it is only recorded.
        """
        li = []
        for reg in self.intervals:
            left, right = reg.start(), reg.end()
            li.extend(
                substr_intervals(self.s[left:right], head, tail, inner=True)
                + left)
        return NestEnv(self.s, Intervals(li))

    def inside(self, head, tail=None):
        r""" 1. Match what lies inside the markers (markers included).

        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').inside(r'\ce{').replace('x')
        '01x01x'
        """
        li = []
        for reg in self.intervals:
            left, right = reg.start(), reg.end()
            li.extend(substr_intervals(self.s[left:right], head, tail) + left)
        return NestEnv(self.s, Intervals(li))

    def outside(self, head, tail=None):
        r""" 2. Match what lies outside the markers.

        >>> NestEnv(r'01\ce{H2O\ce{2}}01\ce{1\ce{3}5}').outside(r'\ce{').replace(lambda s: 'x')
        'x\\ce{H2O\\ce{2}}x\\ce{1\\ce{3}5}'
        """
        li = []
        for reg in self.intervals:
            left, right = reg.start(), reg.end()
            li.extend(
                substr_intervals(self.s[left:right], head, tail, invert=True)
                + left)
        return NestEnv(self.s, Intervals(li))

    def expand(self, ne):
        r""" Extend the current intervals with every interval of ``ne`` that
        intersects them.

        Intervals of ``ne`` that do not touch the current set are ignored.

        >>> ne = LatexNestEnv(r'aa$cc\ce{a}dd$bb\ce{d}h$h$')
        >>> ne.latexcmd1(r'ce').expand(ne.formula()).strings()
        ['$cc\\ce{a}dd$', '\\ce{d}']

        TODO: also allow merely adjacent intervals to trigger the expansion?

        :param ne: a ``NestEnv`` or an ``Intervals``; head/tail pairing rules
            are not accepted by this implementation
        :raises TypeError: if ``ne`` is neither of those types
        """
        if isinstance(ne, NestEnv):
            b = ne.intervals
        elif isinstance(ne, Intervals):
            b = ne
        else:
            raise TypeError(
                f'expand() expects a NestEnv or Intervals, '
                f'got {type(ne).__name__}')
        # Keep only the candidates that overlap something already covered.
        c = self.intervals + Intervals(
            [x for x in b if (self.intervals & x)])
        return NestEnv(self.s, c)

    def filter(self, func):
        r""" Keep only the intervals whose text satisfies ``func``.

        ``func`` receives the text of each interval; the interval is kept
        when ``func(text)`` is truthy.

        >>> LatexNestEnv('aa$bbb$ccc$d$eee$fff$g').formula().filter(lambda s: len(s) > 4).strings()
        ['$bbb$', '$fff$']
        """
        li = [x for x in self.intervals if func(self.s[x.start():x.end()])]
        return NestEnv(self.s, li)

    def _parse_tags(self, tags):
        """ Normalise ``tags`` to a list of ``[head, tail, type]`` triples.

        A flat sequence (legacy one-dimensional input) is cut into groups of
        three; input whose first element is already a list/tuple is returned
        unchanged. A flat input must be non-empty with a length that is a
        multiple of three (checked with ``assert``).
        """
        if not isinstance(tags[0], (list, tuple)):
            # Legacy flat input: convert to the two-dimensional layout first.
            n = len(tags) // 3
            assert n and n * 3 == len(tags)
            tags = [tags[3 * i:3 * (i + 1)] for i in range(n)]
        return tags

    # Disabled sketches of tag-combination helpers, kept for reference.
    #
    # def any(self, tags):
    #     r""" Union of the interval sets located by each tag.
    #
    #     :param tags: same rules as the ``tags`` parameter of nestenv
    #
    #     >>> NestEnv(r'12$34$56\ce{78}90').any(['$', '$', 1, r'\ce{', '}', 1]).replace(lambda s: 'x')
    #     '12x56x90'
    #     """
    #     tags, li = self._parse_tags(tags), []
    #     for tag in tags:
    #         head, tail, t = tag
    #         for reg in self.intervals:
    #             left, right = reg.start(), reg.end()
    #             li.extend(substr_intervals(self.s[left:right], head, tail, invert=(t == 2), inner=(t == 0)) + left)
    #     return NestEnv(self.s, Intervals(li))
    #
    # def all(self, tags):
    #     r""" Intersection of the interval sets located by each tag.
    #
    #     :param tags: same rules as the ``tags`` parameter of nestenv
    #
    #     # Delete content that is both inside a formula and inside \ce
    #     >>> NestEnv(r'12$34$56\ce{78$x$}90').all([r'\ce{', '}', 1, '$', '$', 1]).replace(lambda s: '')
    #     '12$34$56\\ce{78}90'
    #
    #     >>> NestEnv(r'12$34$56\ce{78$x$}90').all(['$', '$', 1, r'\ce{', '}', 1]).replace(lambda s: '')
    #     '12$34$56\\ce{78}90'
    #     """
    #     tags, intervals = self._parse_tags(tags), self.intervals
    #     for tag in tags:
    #         head, tail, t = tag
    #         li = []
    #         for reg in self.intervals:
    #             left, right = reg.start(), reg.end()
    #             li.extend(substr_intervals(self.s[left:right], head, tail, invert=(t == 2), inner=(t == 0)) + left)
    #         intervals &= Intervals(li)
    #     return NestEnv(self.s, intervals)

    def __repr__(self):
        """ Return the text with every non-newline character that lies
        outside the located intervals replaced by a space. """
        t = self.intervals.replace(
            self.s,
            lambda s: s,
            out_repl=lambda s: re.sub(r'[^\n]', ' ', s))
        return t

    def __bool__(self):
        """ The truth value is that of the interval set. """
        return bool(self.intervals)

    def string(self, idx=0):
        """ Text of the ``idx``-th interval (the first one by default), or
        ``None`` when there is no such interval.

        >>> NestEnv('11a22b33a44bcc').find2('a', 'b').string()
        'a22b'
        """
        if self.intervals and idx < len(self.intervals):
            r = self.intervals.li[idx]
            return self.s[r.start():r.end()]
        else:
            return None

    def strings(self):
        """ Texts of all intervals, as a list (empty when nothing matched). """
        if self.intervals:
            return [self.s[r.start():r.end()] for r in self.intervals]
        else:
            return []

    def startlines(self, unique=False):
        r""" 1-based line number, within ``s``, on which each interval starts.

        >>> NestEnv('{}\naa\n{}\n{}{}a\nb').inside('{', '}').startlines()
        [1, 3, 4, 4]

        :param unique: if true, return the sorted distinct line numbers
        """
        if not self.intervals:
            return []

        # 1 Helper data: positions of every newline, plus the end of the text
        #   as a final sentinel.
        linepos = [m.start() for m in re.finditer(r'\n', self.s)]
        n = len(self.s)
        if n and (not linepos or linepos[-1] != n):
            linepos.append(n)

        # 2 Starting line number of each sub-interval.
        lines = [
            bisect.bisect_right(linepos, x.start() - 1) + 1
            for x in self.intervals
        ]

        if unique:
            lines = sorted(set(lines))

        return lines

    def group(self, idx=0):
        """ The ``idx``-th interval (the first one by default) wrapped as a
        ``ReMatch`` over the whole text, or ``None`` if there is none. """
        if self.intervals and idx < len(self.intervals):
            r = self.intervals.li[idx]
            return ReMatch(r.regs, self.s, 0, len(self.s))
        else:
            return None

    def groups(self):
        """ All intervals wrapped as ``ReMatch`` objects over the whole text
        (empty list when nothing matched). """
        if self.intervals:
            return [
                ReMatch(r.regs, self.s, 0, len(self.s))
                for r in self.intervals
            ]
        else:
            return []

    # TODO def gettag, settag, gettags, settags: special inside operations
    # TODO def getattr, setattr, getattrs, setattrs

    def sub(self,
            infunc=lambda m: m.group(),
            *,
            outfunc=lambda m: m.group(),
            adjacent=False) -> str:
        """ Replacement in the style of ``re.sub``; delegates to
        ``self.intervals.sub`` with ``infunc`` for located intervals and
        ``outfunc`` (as ``out_repl``) for the rest. """
        return self.intervals.sub(self.s,
                                  infunc,
                                  out_repl=outfunc,
                                  adjacent=adjacent)

    def replace(self,
                arg1,
                arg2=None,
                *,
                outfunc=lambda s: s,
                adjacent=False) -> str:
        """ Replacement in the style of ``str.replace``.

        ``arg1`` may be a custom replacement function, or ``arg1``/``arg2``
        may be given like the arguments of ``str.replace(arg1, arg2)``.
        Delegates to ``self.intervals.replace``.
        """
        return self.intervals.replace(self.s,
                                      arg1,
                                      arg2,
                                      out_repl=outfunc,
                                      adjacent=adjacent)

    def __invert__(self):
        r""" Complement of the interval set within the text.

        >>> (~NestEnv('aa$b$cc').find2('$', '$')).strings()
        ['aa', 'cc']
        """
        return NestEnv(self.s, self.intervals.invert(len(self.s)))

    def invert(self):
        r""" Method form of ``~self``.

        >>> NestEnv('aa$b$cc').find2('$', '$').invert().strings()
        ['aa', 'cc']
        """
        return ~self

    def __and__(self, other):
        r""" Interval-set intersection.

        >>> s = 'aa$b$ccc$dd$eee'
        >>> (NestEnv(s).find2('$', '$') & NestEnv(s).inside('a', 'd')).strings()
        ['$b$', '$d']
        >>> (NestEnv(s).find2('$', '$') & re.finditer(r'a.*?d', s)).strings()
        ['$b$', '$d']

        :raises ValueError: if ``other`` is a ``NestEnv`` over a different text
        """
        if isinstance(other, Intervals):
            return NestEnv(self.s, self.intervals & other)
        elif isinstance(other, NestEnv):
            if self.s != other.s:
                # Interval sets over different texts cannot be combined.
                raise ValueError('两个NestEnv的主文本内容不相同,不能求子区间集的交')
            return NestEnv(self.s, self.intervals & other.intervals)
        else:
            # Everything else is converted to an Intervals object rather
            # than being rejected with a TypeError.
            return NestEnv(self.s, self.intervals & Intervals(other))

    def __or__(self, other):
        """ Interval-set union.

        >>> s = 'aa$b$ccc$dd$eee'
        >>> (NestEnv(s).find2('$', '$') | NestEnv(s).inside('a', 'd')).strings()
        ['aa$b$ccc$dd$']
        >>> (NestEnv(s).find2('$', '$') | re.finditer(r'a.*?d', s)).strings()
        ['aa$b$ccc$dd$']

        :raises ValueError: if ``other`` is a ``NestEnv`` over a different text
        """
        if isinstance(other, Intervals):
            return NestEnv(self.s, self.intervals | other)
        elif isinstance(other, NestEnv):
            if self.s != other.s:
                raise ValueError('两个NestEnv的主文本内容不相同,不能求子区间集的并')
            return NestEnv(self.s, self.intervals | other.intervals)
        else:
            # Everything else is converted to an Intervals object first.
            return NestEnv(self.s, self.intervals | Intervals(other))

    def __add__(self, other):
        """ Alias of ``self | other``. """
        return self | other

    def __sub__(self, other):
        """ Interval-set subtraction.

        >>> s = 'aa$b$ccc$dd$eee'
        >>> (NestEnv(s).find2('$', '$') - NestEnv(s).inside('a', 'd')).strings()
        ['d$']
        >>> (NestEnv(s).find2('$', '$') - re.finditer(r'a.*?d', s)).strings()
        ['d$']

        :raises ValueError: if ``other`` is a ``NestEnv`` over a different text
        """
        if isinstance(other, Intervals):
            return NestEnv(self.s, self.intervals - other)
        elif isinstance(other, NestEnv):
            if self.s != other.s:
                raise ValueError('两个NestEnv的主文本内容不相同,子区间集不能相减')
            return NestEnv(self.s, self.intervals - other.intervals)
        else:
            # Everything else is converted to an Intervals object first.
            return NestEnv(self.s, self.intervals - Intervals(other))

    def nest(self, func, invert=False):
        """ Run one level of nested locating inside every current
        sub-interval.

        :param func: a callable used as ``func(s)``; it receives the text of
            one sub-interval and returns an "Intervals-like" object whose
            positions are relative to that text
        :param invert: if true, the result found inside each sub-interval is
            inverted with respect to the length of that sub-interval
        :return: a new object of the same class as ``self``

        Essentially every locating feature is meant to be built on this
        pattern: matching is not done against the whole of ``self.s`` but
        only within the regions marked by ``self.intervals``.
        """
        li = []
        for reg in self.intervals:
            left, right = reg.start(), reg.end()
            t = self.s[left:right]
            res = Intervals(func(t))
            if invert:
                res = res.invert(len(t))
            # Shift from piece-relative to absolute positions.
            li.extend(res + left)
        return type(self)(self.s, Intervals(li))
def substr_intervals(s, head, tail=None, invert=False, inner=False):
    r""" Locate the spans of ``s`` delimited by ``head`` / ``tail``.

    Legacy helper; not recommended. Prefer using the newer NestEnv interface
    directly.

    The (head, tail) pair is first completed by ``infer_headtail`` and then
    dispatched, in a fixed order, to the matching NestEnv / LatexNestEnv
    locator. The first rule that applies wins, so the order of the branches
    below is significant.

    :param s: the content to search
    :param head: opening marker
        TODO: regex and non-regex heads could be handled by separate
        sub-functions instead of all living in this function
    :param tail: closing marker; inferred from ``head`` when omitted
        TODO: support mixing plain strings and regex objects as head/tail
    :param invert: if true, return the complement of the located spans
        within ``s``
    :param inner: False -- located spans include the markers;
        True -- markers are excluded.
        TODO: many of the matching features do not support inner mode yet
    :return: an ``Intervals`` object
        TODO: handle tabular nested in tabular correctly?
        TODO: support locating topic and sub_topic at the same time?
    """
    # Opening bracket -> its closing partner.
    bracket_pairs = {'[': ']', '{': '}', '(': ')', '<': '>'}

    def infer_headtail(head, tail=None):
        """ Given an abbreviated head/tail, return the inferred full pair. """
        if isinstance(head, str) and tail is None:
            if re.match(r'\$+$', head):  # formula
                tail = head
            elif re.match(r'\\(chapter|section|subsection){', head):
                # Leave tail as None: it stands for an undetermined end mark.
                pass
            elif head[-1] in '[{(<':  # paired brackets
                tail = bracket_pairs[head[-1]]
            elif head.startswith('%<'):
                tail = '%/'
            elif head[0] == '<':
                tail = 'xmltag'
            elif re.match(r'\\begin{[a-zA-Z]+}', head):  # latex environment
                m = re.match(r'\\begin({[a-zA-Z]+})', head)
                tail = r'\end' + m.group(1)
            else:  # nothing could be inferred
                tail = None
        return head, tail

    head, tail = infer_headtail(head, tail)

    parts = []
    # 1 Bracket matching: the last char of head and the first char of tail
    #   form a bracket pair.
    # TODO: the matching bracket of tail could also sit in the middle of the
    #   content instead of at its ends.
    if head[-1] in bracket_pairs and tail and tail[0] == bracket_pairs[head[-1]]:
        parts = NestEnv(s).bracket(head, tail, inner).intervals
    # 2 Second kind of bracket matching: the first char of head and the last
    #   char of tail form a bracket pair.
    elif head[0] in bracket_pairs and tail and tail[-1] == bracket_pairs[head[0]]:
        parts = NestEnv(s).bracket2(head, tail, inner).intervals
    # 3 Formula matching
    elif head == tail == '$':
        parts = LatexNestEnv(s).formula(inner).intervals
    # 4 Percent-comment structure: %<xxx a='yy'> ... %</xxx>
    elif re.match(r'%<[a-zA-Z\-_]+', head) and tail == '%/':
        # NOTE(review): the literal string 'inner' is passed here, not the
        # `inner` argument -- confirm this is intended.
        parts = LatexNestEnv(s).pxmltag(head[2:], 'inner').intervals
    # 5 latex chapter / section / subsection
    elif re.match(r'\\(chapter|section|subsection)', head) and not tail:
        # TODO: support the inner feature
        # NOTE(review): unlike the other branches, `.intervals` is not taken
        # here; the result is handed to Intervals(...) as-is -- confirm.
        parts = LatexNestEnv(s).latexpart(head[1:], inner=inner)
    # 6 latex \item
    elif head == r'\item':
        parts = LatexNestEnv(s).item().intervals
    # 7 latex environment matching
    elif re.match(r'\\begin{([a-zA-Z]+)}', head):
        m1 = re.match(r'\\begin{([a-zA-Z]+)}', head)
        m2 = re.match(r'\\end{([a-zA-Z]+)}', tail)
        if m2 and m1.group(1) == m2.group(1):
            parts = LatexNestEnv(s).latexenv(head, tail, inner).intervals
        else:
            parts = LatexNestEnv(s).find2(head, tail, inner).intervals
    # 8 all figure-inclusion commands in the latex source
    elif head == r'\includegraphics' and tail is None:
        # NOTE(review): the literal string 'inner' is passed here as well.
        parts = LatexNestEnv(s).includegraphics('inner').intervals
    # 9 lewis electron-dot formula
    elif head == r'\lewis' and tail is None:
        parts = LatexNestEnv(s).lewis(inner=inner).intervals
    # 10 xml tag node
    elif head[0] == '<' and tail == 'xmltag':
        parts = NestEnv(s).xmltag(head[1:], inner).intervals
    # +. plain matching
    elif isinstance(head, str) and isinstance(tail, str):
        parts = NestEnv(s).find2(head, tail, inner).intervals
    elif isinstance(head, str) and not isinstance(tail, str):
        parts = NestEnv(s).find(head).intervals

    t = Intervals(parts)
    if invert:
        t = t.invert(len(s))
    return t
def split_vector_interval(vec, maxsplit=None, minwidth=3):
    """ Cut a one-dimensional vector into foreground segments separated by
    sufficiently wide background gaps.

    :param vec: a one-dimensional vector to cut; slices of it must support
        ``.sum()``. The values are expected to be prepared beforehand:
            background is non-positive -- the more certainly background, the
                larger the absolute value of the negative number;
            foreground is positive -- the more certainly foreground, the
                larger the value.
        The goal is to obtain the regions with the most energy (largest
        values, i.e. foreground content) while tolerating some noise.
        Typical usage:
            0 stands for background;
            a positive value below 1 is the ratio of dark pixels in that
                column (np.mean); integer sums (np.sum) also work for now,
                but ratios are preferable for future extension;
            a negative value marks special background that offsets part of
                the ``minwidth`` requirement.
    :param maxsplit: maximum number of resulting segments; when not set,
        every qualifying gap is used as a cut
    :param minwidth: minimum width a gap needs in order to become a cut
    :return: [[l, r], [l, r], ...] -- left/right bounds of each segment,
        expressed in positions of the original ``vec``
    """
    # 1 Trim the non-positive margins on both sides.
    total = len(vec)
    lo, hi = 0, total
    while lo < hi and vec[lo] <= 0:
        lo += 1
    while hi > lo and vec[hi - 1] <= 0:
        hi -= 1
    # (Original author's disabled idea: only strip a margin once it is at
    #  least `minwidth` wide.)

    body = vec[lo:hi]
    width = len(body)
    if not width:
        return []  # no content at all

    # 2 Find the cut positions: measure every run of consecutive background
    #   values and use the sum of its values as its background confidence.
    gaps = []  # items: [[start, stop], score]
    run_start, run_len = 0, 0

    def flush_run():
        """ Called on foreground or at the end of the scan: record the
        pending background run if it is wide enough, then reset it. """
        nonlocal run_len
        score = body[run_start:run_start + run_len].sum()
        # A negative score lowers the width required by minwidth.
        if run_len >= (minwidth + score):
            gaps.append([[run_start, run_start + run_len], score])
        run_len = 0

    for pos in range(width):
        if body[pos] <= 0:
            if run_len == 0:
                run_start = pos
            run_len += 1
        else:
            flush_run()
    flush_run()

    # 3 Keep the most confident cuts (most negative score first), then put
    #   them back in left-to-right order.
    if maxsplit:
        gaps.sort(key=lambda gap: gap[1])
        gaps = gaps[:(maxsplit - 1)]
        gaps.sort(key=lambda gap: gap[0])

    # 4 The segments are the complement of the gaps, shifted back by the
    #   trimmed left margin.
    segments = Intervals([span for span, _ in gaps]).invert(width) + lo
    return [[seg.start(), seg.end()] for seg in segments]