def escape_bp_tag(text):
    """Escape reply text so it cannot be confused with layout markup.

    Three transformations are applied, in this order:

    1. Every line that starts with ``<time>`` or ``<mark>`` gets a ``#``
       inserted in front of the tag.
    2. The full-width-bracketed quote / supplementary-reply markers are
       rewritten with ASCII square brackets.
    3. If the result ends with one of the two processing-note markers,
       a trailing ``#`` is appended.

    Returns the escaped string.
    """
    # Layout tags at the start of a line.
    escaped = red.sub(r'^(<(?:time|mark)>)', r'#\1', text,
                      flags=red.MULTILINE)

    # Quote-begin / quote-end / supplementary-reply markers.
    escaped = red.sub(r'【(引用(?:开始|结束)|补充回复)】', r'[\1]', escaped)

    # Processing notes that mark a reply as duplicated or unprocessable.
    processing_notes = ('【与上一条回复重复】', '【无法处理的回复】')
    if escaped.endswith(processing_notes):
        return escaped + '#'
    return escaped
def bp_to_final(infile, keep_discard=True, label=0):
    """Compile layout text into the final text and the discarded text.

    The input is scanned line by line:

    * A line starting with ``<time>`` opens a reply and the next line
      starting with ``<mark>`` closes it.  If the ``<mark>`` line ends with
      ``█`` the reply is kept, otherwise it is discarded (replies whose
      lines are all blank are dropped silently).
    * Inside a reply, ``[img…]…[/img]`` tags are replaced by an image
      placeholder.
    * Outside a reply, ``<tiezi>`` lines seen before any kept or discarded
      content are collected (prefix stripped) into the info list, and
      ``<page>`` lines update the current page number when ``label != 0``.

    Args:
        infile: Open text file-like object.  It is iterated line by line and
            is always closed before this function returns or raises.
        keep_discard: When true and at least one reply was discarded, the
            discarded text is returned as a second stream.
        label: Assist mode.  ``0`` adds nothing; ``1`` inserts a page banner
            before the first kept reply of each page; ``2`` puts a header
            with the pick number, post time and page number before every
            kept reply.

    Returns:
        A 4-tuple ``(output, discard, info_list, chinese_ct)``: ``output`` is
        a ``StringIO`` with the final text, ``discard`` is a ``StringIO`` with
        the discarded text or ``None``, ``info_list`` is the list of collected
        ``<tiezi>`` lines, and ``chinese_ct`` is the value returned by
        ``count_chinese`` for the final text.
    """

    class PageHolder:
        """Slot reserved in ``text_list`` for a page banner (label == 1)."""

        def __init__(self, posi=0, pagenum=0, show=False):
            self.posi = posi        # index of the reserved slot in text_list
            self.pagenum = pagenum  # page number read from the <page> line
            self.show = show        # set once a reply after it has been kept

    info_list = []
    # The first holder is a sentinel so that holder_list[-1] is valid even
    # before any <page> line has been seen; it is skipped when rendering.
    holder_list = [PageHolder()]
    text_list = []
    abandon_list = []
    pickcount, allcount = 0, 0

    # Turns e.g. [img]http://img3.laibafile.cn/p/m/1234567.jpg[/img] into
    # 【一张图片N】, where N is the optional number inside the opening tag.
    picr = r'\[img\s*(\d+|)\].*?\[/img\]'
    pattern = red.re_dict(picr)

    # Page number.
    re_pagenum = red.re_dict(r'^<page>页号:\s*(\d+)\s*$')

    # Post time: captures the date without its first two year digits, and
    # the hour:minute part.
    p_time = (r'^<time>[^<]*<\d\d(\d\d-\d{1,2}-\d{1,2})\s+'
              r'(\d{1,2}:\d{1,2})')
    re_time = red.re_dict(p_time)

    # Read the layout text.
    in_reply = False
    temp = []               # lines of the reply currently being read
    current_page = 0
    current_time = ''

    try:
        for line in infile:
            if line.startswith('<time>'):
                if in_reply:
                    print('格式错误:回复文本的前后包括标志不配对。\n',
                          '丢失<mark>行')
                    break
                in_reply = True

                # The post time is only needed for the floor header.
                if label == 2:
                    m = re_time.search(line)
                    if m:
                        current_time = m.group(1) + ' ' + m.group(2)
                    else:
                        current_time = ''

            elif line.startswith('<mark>'):
                if not in_reply:
                    print('格式错误:回复文本的前后包括标志不配对。\n',
                          '丢失<time>行')
                    break

                if line.endswith(('█\n', '█')):
                    # Reply was picked.
                    pickcount += 1

                    if label == 1:
                        holder_list[-1].show = True
                    elif label == 2:
                        floor_label = ('№.%d ☆☆☆'
                                       ' 发表于%s P.%d '
                                       '☆☆☆\n'
                                       '-------------------------'
                                       '-------------------------'
                                       '\n')
                        floor_label = floor_label % \
                            (pickcount, current_time, current_page)
                        text_list.append(floor_label)

                    text_list.extend(temp)
                    text_list.append('\n')
                elif any(piece.strip() != '' for piece in temp):
                    # Reply was not picked and is not blank: discard it.
                    abandon_list.extend(temp)
                    abandon_list.append('∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞\n\n')

                temp.clear()
                allcount += 1
                in_reply = False

            elif in_reply:
                temp.append(pattern.sub(r'【一张图片\1】', line))

            # Because of the previous branch, everything below is outside
            # a reply.
            elif (not text_list and not abandon_list
                  and line.startswith('<tiezi>')):
                info_list.append(line[len('<tiezi>'):])

            elif label != 0:
                m = re_pagenum.search(line)
                if m:
                    current_page = int(m.group(1))
                    if label == 1:
                        # Reserve a slot; it is filled in later only if a
                        # reply following this page line gets picked.
                        text_list.append('')
                        holder_list.append(
                            PageHolder(len(text_list) - 1, current_page))
    finally:
        # Close the input even when parsing raises.
        infile.close()

    if in_reply:
        print('格式错误:最后一个回复文本的前后包括标志不配对。')

    # Page banners (assist mode 1).
    if label == 1:
        for holder in holder_list[1:]:
            if holder.show:
                page_label = ('☆☆☆☆☆'
                              ' 进入第%d页 '
                              '☆☆☆☆☆\n'
                              '----------------'
                              '----------------'
                              '\n\n') % holder.pagenum
                text_list[holder.posi] = page_label

    color_p1 = color.fore_color(allcount, color.Fore.YELLOW)
    color_p2 = color.fore_color(pickcount, color.Fore.YELLOW)
    print('共有{0}条回复,选择了其中{1}条回复'.format(color_p1, color_p2))

    # Build the output text; the info lines (if any) come first, followed
    # by one newline.
    if info_list:
        s_iter = itertools.chain(info_list, '\n', text_list)
    else:
        s_iter = iter(text_list)
    s = ''.join(s_iter)

    # Collapse runs of consecutive image placeholders.  Runs of three or
    # more must be handled before pairs.
    s = red.sub(r'(?:【一张图片(\d+|)】\s+){3,}', r'【多张图片\1】\n\n', s)
    s = red.sub(r'(?:【一张图片(\d+|)】\s+){2}', r'【两张图片\1】\n\n', s)

    output = StringIO(s)

    # Chinese character count of the final text.
    chinese_ct = count_chinese(s)

    # Discarded text.
    if keep_discard and abandon_list:
        s_iter = itertools.chain(info_list, '\n', abandon_list)
        discard = StringIO(''.join(s_iter))
    else:
        discard = None

    return output, discard, info_list, chinese_ct
def doit(self):
    """Process the thread whose URL is on the clipboard and save the result.

    Steps:

    1. Read the URL from the clipboard and validate it with
       ``tz2txt.is_url``; show an error in the URL field and return if it
       is unusable.
    2. Run ``tz2txt.auto`` with the selected assist mode and last page.
       On any exception the error is printed and the method returns.
    3. Write the output text to the chosen file name (encoding gb18030),
       asking before overwriting unless forced overwrite is selected.
    4. Write the discarded text, if any, to ``discard_fn``.
    5. Print the info lines up to (not including) the download-time line.

    Returns None.
    """
    # Read and show the URL.  Any failure reading the clipboard is treated
    # the same as an invalid URL.
    try:
        u = self.master.clipboard_get().strip()
    except Exception:
        u = None

    if u is None or not tz2txt.is_url(u):
        self.url.set('无效网址,网址须以http://或https://开头。')
        return
    self.url.set(u)

    # Assist mode.  Unknown values fall back to '' (no assist) instead of
    # leaving `label` unbound.
    label = {1: '', 2: 'page', 3: 'floor'}.get(self.assist.get(), '')

    # Last page; anything that is not an integer becomes -1.
    till = self.till.get().strip()
    try:
        till = int(till)
    except ValueError:
        till = -1

    # Run the job.
    self.status['fg'] = '#993300'
    self.status['text'] = '处理中'
    self.update()

    # The except branch returns; the finally branch always resets the
    # status line.
    try:
        output, discard_output, title, info_list, chinese_ct = \
            tz2txt.auto(u, till, '', '', label, from_gui=True)
        if title is None:
            raise Exception('无法完成全自动处理')
    except Exception as e:
        print('\n出现异常:', e)
        print('===================================\n')
        return
    else:
        # Show the title with characters above U+FFFF removed
        # (presumably because the widget cannot display them - TODO confirm).
        title = red.sub(r'[\U00010000-\U0010FFFF]', r'', title)
        title = title.strip()
        self.url.set(title)
    finally:
        self.status['fg'] = 'blue'
        self.status['text'] = '待机'

    # Output file name.
    if self.rename.get():
        output_fn = title + '.txt'
    else:
        output_fn = self.output.get().strip()

    # Strip characters that are not allowed in file names.
    output_fn = red.sub(r'[\\/:*?"<>|]', r'', output_fn)
    if output_fn == '.txt':
        output_fn = '楼主.txt'

    # Output content.
    text = output.getvalue()
    output.close()

    # Overwrite check: file exists and there is output and
    # (forced overwrite or the user confirms).
    if (os.path.isfile(output_fn) and text and
            (self.override.get() == 1 or
             messagebox.askyesno('输出文件已存在',
                                 '是否覆盖?\n%s' % output_fn))):
        # Remove the existing target.  If that fails the file still exists
        # and the write below is skipped, so report the reason.
        try:
            os.remove(output_fn)
        except OSError as e:
            print('\n保存文件时出现异常', e)

    # Write the output.
    if not os.path.isfile(output_fn) and text:
        try:
            with open(output_fn, 'w',
                      encoding='gb18030', errors='replace') as f:
                f.write(text)
            print('\n已保存为:', output_fn)
        except Exception as e:
            print('\n保存文件时出现异常', e)
        else:
            # Size information is only meaningful (and the file is only
            # guaranteed to exist) when the write succeeded.
            size2 = format(os.path.getsize(output_fn), ',')
            chinese_ct = format(chinese_ct, ',')
            print('输出文件 {0} 字节,约 {1} 个汉字。'.format(
                size2, chinese_ct))

    # Write the discarded text.
    # NOTE(review): discard_fn is not assigned in this method; presumably a
    # module-level name - confirm.
    if discard_output is not None:
        try:
            text = discard_output.getvalue()
            discard_output.close()
            if text:
                with open(discard_fn, 'w',
                          encoding='gb18030', errors='replace') as f:
                    f.write(text)
        except Exception as e:
            print('\n保存文件时出现异常', e)

    # Print the info lines, stopping at the download-time line.
    print()
    for line in info_list:
        if line.startswith('下载时间:'):
            break
        datamachine.save_print(line.rstrip('\n'))
    print('===================================\n')
def doit(self):
    """Process the thread whose URL is on the clipboard and save the result.

    Steps:

    1. Read the URL from the clipboard and validate it with
       ``tz2txt.is_url``; show an error in the URL field and return if it
       is unusable.
    2. Run ``tz2txt.auto`` with the selected assist mode and last page.
       On any exception the error is printed and the method returns.
    3. Write the output text to the chosen file name (encoding gb18030),
       asking before overwriting unless forced overwrite is selected.
    4. Write the discarded text, if any, to ``discard_fn``.
    5. Print the info lines up to (not including) the download-time line.

    Returns None.
    """
    # Read and show the URL.  Any failure reading the clipboard is treated
    # the same as an invalid URL.
    try:
        u = self.master.clipboard_get().strip()
    except Exception:
        u = None

    if u is None or not tz2txt.is_url(u):
        self.url.set('无效网址,网址须以http://或https://开头。')
        return
    self.url.set(u)

    # Assist mode.  Unknown values fall back to '' (no assist) instead of
    # leaving `label` unbound.
    label = {1: '', 2: 'page', 3: 'floor'}.get(self.assist.get(), '')

    # Last page; anything that is not an integer becomes -1.
    till = self.till.get().strip()
    try:
        till = int(till)
    except ValueError:
        till = -1

    # Run the job.
    self.status['fg'] = '#993300'
    self.status['text'] = '处理中'
    self.update()

    # The except branch returns; the finally branch always resets the
    # status line.
    try:
        output, discard_output, title, info_list, chinese_ct = \
            tz2txt.auto(u, till, '', '', label, from_gui=True)
        if title is None:
            raise Exception('无法完成全自动处理')
    except Exception as e:
        print('\n出现异常:', e)
        print('===================================\n')
        return
    else:
        # Show the title with characters above U+FFFF removed
        # (presumably because the widget cannot display them - TODO confirm).
        title = red.sub(r'[\U00010000-\U0010FFFF]', r'', title)
        title = title.strip()
        self.url.set(title)
    finally:
        self.status['fg'] = 'blue'
        self.status['text'] = '待机'

    # Output file name.
    if self.rename.get():
        output_fn = title + '.txt'
    else:
        output_fn = self.output.get().strip()

    # Strip characters that are not allowed in file names.
    output_fn = red.sub(r'[\\/:*?"<>|]', r'', output_fn)
    if output_fn == '.txt':
        output_fn = '楼主.txt'

    # Output content.
    text = output.getvalue()
    output.close()

    # Overwrite check: file exists and there is output and
    # (forced overwrite or the user confirms).
    if (os.path.isfile(output_fn) and text and
            (self.override.get() == 1 or
             messagebox.askyesno('输出文件已存在',
                                 '是否覆盖?\n%s' % output_fn))):
        # Remove the existing target.  If that fails the file still exists
        # and the write below is skipped, so report the reason.
        try:
            os.remove(output_fn)
        except OSError as e:
            print('\n保存文件时出现异常', e)

    # Write the output.
    if not os.path.isfile(output_fn) and text:
        try:
            with open(output_fn, 'w',
                      encoding='gb18030', errors='replace') as f:
                f.write(text)
            print('\n已保存为:', output_fn)
        except Exception as e:
            print('\n保存文件时出现异常', e)
        else:
            # Size information is only meaningful (and the file is only
            # guaranteed to exist) when the write succeeded.
            size2 = format(os.path.getsize(output_fn), ',')
            chinese_ct = format(chinese_ct, ',')
            print('输出文件 {0} 字节,约 {1} 个汉字。'.format(
                size2, chinese_ct))

    # Write the discarded text.
    # NOTE(review): discard_fn is not assigned in this method; presumably a
    # module-level name - confirm.
    if discard_output is not None:
        try:
            text = discard_output.getvalue()
            discard_output.close()
            if text:
                with open(discard_fn, 'w',
                          encoding='gb18030', errors='replace') as f:
                    f.write(text)
        except Exception as e:
            print('\n保存文件时出现异常', e)

    # Print the info lines, stopping at the download-time line.
    print()
    for line in info_list:
        if line.startswith('下载时间:'):
            break
        datamachine.save_print(line.rstrip('\n'))
    print('===================================\n')