def check():
    """Fetch the release-announcement page and extract the latest version.

    The page at a fixed URL is downloaded, decoded as UTF-8, and searched for
    the text enclosed by the 【最新版本】…【结束】 and 【更新网址】…【结束】
    markers.

    Returns:
        A ``(newver, download_url)`` tuple holding the two captured strings
        exactly as they appear on the page.

    Raises:
        Exception: if the page cannot be downloaded, cannot be decoded as
            UTF-8, or does not contain the expected markers.  For the first
            two cases the underlying error is attached as ``__cause__``.
    """
    fetcher_info = FetcherInfo()
    f = Fetcher(fetcher_info)

    url = 'http://www.cnblogs.com/animalize/p/4773363.html'

    # ``except Exception`` (rather than a bare ``except``) lets
    # KeyboardInterrupt / SystemExit propagate instead of being reported as a
    # download failure; ``from e`` keeps the real cause in the traceback.
    try:
        data = f.fetch_url(url)
    except Exception as e:
        raise Exception('无法下载“版本发布网页”') from e

    try:
        html = data.decode('utf-8')
    except Exception as e:
        raise Exception('无法用utf-8解码“版本发布网页”') from e

    # DOTALL: the two marker pairs may be separated by arbitrary markup,
    # including newlines.
    p = r'【最新版本】(.*?)【结束】.*?【更新网址】(.*?)【结束】'
    r = red.re_dict(p, red.DOTALL)

    m = r.search(html)
    if not m:
        raise Exception('无法从“版本发布网页”提取最新的版本号')

    newver = m.group(1)
    download_url = m.group(2)
    return newver, download_url
def process_1(self):
    """Custom processing step: rewrite raw quotes into the marked-up layout.

    Every reply in ``self.rlist`` has its text run through one regular
    expression.  A match is replaced by::

        回复 <name>:
        【引用开始】<quoted text>
        【引用结束】
        <reply text>

    where ``<name>`` is the 1-16 non-space characters found between ``@@``
    and ``##``, and the quoted/reply texts are what precedes/follows the
    ``re_separater`` line.  The number of substitutions is printed.
    """
    # Handle quotes.
    print('>处理引用')

    # The ``(?=(X))\1`` construct emulates an atomic group.  An equivalent
    # pattern written with real atomic groups ``(?>...)`` was kept by the
    # original author as a commented-out alternative; it used the template
    # r'回复 \1:\n【引用开始】\2\n【引用结束】\n\3' because it needs fewer
    # capturing groups.
    pattern_parts = (
        r'^(?=(.*@@(\S{1,16})##))', r'\1',
        r'.*?',
        r'(?<=\n)',
        r'(?=(.*?(?<=\n)', re_separater, r'\s+))',
        r'(?!\3.*?(?<=\n)', re_separater, r'\s+)',
        r'\s*(.*?)\s*', re_separater, r'\s+(.*)',
    )
    quote_re = red.re_dict(''.join(pattern_parts), red.DOTALL)
    template = r'回复 \2:\n【引用开始】\4\n【引用结束】\n\5'

    quote_count = 0
    for reply in self.rlist:
        reply.text, hits = quote_re.subn(template, reply.text)
        quote_count += hits

    color_p = color.fore_color(quote_count, color.Fore.CYAN)
    print('...处理了{0}条引用'.format(color_p))
def mark_reduplicate(self):
    """Flag replies whose text equals that of the reply right before them.

    A flagged reply gets the note 【与上一条回复重复】 added through
    ``self.append_note`` and its ``suggest`` attribute set to False.
    Whitespace-only replies are never flagged.  The number of flagged
    replies is printed.
    """
    print('>检查相邻重复:')

    blank_re = red.re_dict(r'^\s*$')

    # Pass 1: collect the duplicates.  The texts are only annotated in a
    # second pass so that the comparison with the following reply still sees
    # the unmodified text.
    duplicates = []
    previous = None
    for current in self.rlist:
        is_repeat = (previous
                     and previous.text == current.text
                     and not blank_re.match(current.text))
        if is_repeat:
            duplicates.append(current)
        previous = current

    # Pass 2: annotate and de-select them.
    for dup in duplicates:
        dup.text = self.append_note(dup.text, '【与上一条回复重复】')
        dup.suggest = False

    total = len(duplicates)
    tint = color.Fore.RED if total else color.Fore.GREEN
    color_p = color.fore_color(total, tint)
    print('...标记了{0}个重复回复'.format(color_p))
def mark_multireply(self):
    """Merge consecutive suggested replies that quote exactly the same text.

    Only replies with ``suggest`` set are examined; others are skipped
    without breaking a run.  When a reply's text up to (not including)
    【引用结束】 equals that of the previous quoting reply, its own reply
    part is appended to that previous reply under a 【补充回复】 heading, its
    text is emptied and its ``suggest`` flag cleared.  A suggested reply
    without a quote ends the current run.  The number of merged replies is
    printed.
    """
    print('>开始标记 连续重复引用的回复')

    # group(1): everything up to the end of the quoted text
    # group(2): the reply part that follows 【引用结束】
    r = red.re_dict(r'^(.*?【引用开始】.*?)【引用结束】\n?(.*)$', red.S)

    last_reply = None   # most recent quoting reply that was kept
    last_quote = None   # quote part (group 1) of that reply
    count = 0

    for rpl in self.rlist:
        if not rpl.suggest:
            continue

        m = r.match(rpl.text)
        if m is None:
            # A suggested reply without a quote interrupts the run.
            last_reply = None
            last_quote = None
            continue

        quote_part = m.group(1)
        if last_quote == quote_part:
            # ``last_quote`` is only ever set together with ``last_reply``,
            # so ``last_reply`` is a real reply here.
            last_reply.text += '\n\n【补充回复】\n' + m.group(2)
            rpl.text = ''
            rpl.suggest = False
            count += 1
        else:
            last_reply = rpl
            last_quote = quote_part

    fore = color.Fore.RED if count else color.Fore.GREEN
    color_p = color.fore_color(count, fore)
    print('...有{0}个连续重复引用的回复'.format(color_p))
def has_quote(reply):
    """Return True if ``reply.text`` contains a processed quote.

    A processed quote is a 【引用开始】 marker followed later by a
    【引用结束】 marker.

    The pattern is compiled with DOTALL because the quote layout used
    elsewhere in this file puts the two markers on different lines
    (``...\\n【引用开始】...\\n【引用结束】\\n...``); without the flag ``.``
    stops at the first newline and such quotes are not detected.
    """
    p = red.re_dict(r'^.*?【引用开始】.*?【引用结束】', red.DOTALL)
    return bool(p.search(reply.text))
def has_unhandled_quote(self, reply):
    """Return True if ``reply.text`` still contains a raw quote marker.

    A raw marker is ``@@`` followed by 1-16 non-space characters and ``##``.
    (A second check against ``re_datetime`` was disabled by the original
    author and remains disabled.)
    """
    raw_marker = red.re_dict(r'@@\S{1,16}##')
    return raw_marker.search(reply.text) is not None
def get_processor(all_list):
    """Look up the processor named by a ``<processor: NAME>`` tag.

    Only the first element of ``all_list`` is searched for the tag.

    Returns:
        Whatever ``BaseProcessor.get_processor(NAME)`` returns, or None when
        ``all_list`` is empty or its first element carries no such tag.
    """
    if not all_list:
        return None

    tag_re = red.re_dict(r'<processor:\s*(.*?)\s*>')
    found = tag_re.search(all_list[0])
    if not found:
        return None

    return BaseProcessor.get_processor(found.group(1))
def should_pick(reply):
    """Return False for replies that should be left out, True otherwise.

    A reply is left out when its text is empty / whitespace-only, or ends
    with the 【与上一条回复重复】 or 【无法处理的回复】 note.
    """
    blank_re = red.re_dict(r'^\s*$')
    excluded_suffixes = ('【与上一条回复重复】', '【无法处理的回复】')
    return not (blank_re.match(reply.text)
                or reply.text.endswith(excluded_suffixes))
def is_url(url):
    """Return True if ``url`` looks like an http:// or https:// URL.

    The host may be a dotted domain name, ``localhost`` or four dot-separated
    groups of 1-3 digits (the groups are not range-checked).  A ``:port`` and
    a path / query part are optional.  Matching is case-insensitive and
    ASCII-only.
    """
    url_re = red.re_dict(
        r'^https?://'  # scheme
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain name ...
        r'localhost|'  # ... or localhost ...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ... or dotted quad
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$',
        red.IGNORECASE|red.A)
    return url_re.match(url) is not None
def bp_to_internal2(infile):
    """Parse an arrangement-text stream into "intermediate form 2".

    Lines are read with their trailing newline removed.  A reply starts at a
    line beginning with ``<time>`` (which must contain a
    ``<YYYY-MM-DD HH:MM:SS`` timestamp) and ends at the next line beginning
    with ``<mark>``; the lines in between form the reply text.  A ``<mark>``
    line ending in ``█`` marks the reply as selected.

    Parsing stops early, after printing a message, when the ``<time>`` /
    ``<mark>`` lines are not properly paired or a timestamp cannot be found.

    Args:
        infile: text stream providing ``readlines()`` and ``close()``.  It is
            always closed before this function returns or raises.

    Returns:
        A list in input order whose items are either ``BPReply`` objects
        (one per completed reply) or plain ``str`` lines found outside any
        reply.
    """
    all_list = list()

    pattern = red.re_dict(r'<(\d{4}-\d\d-\d\d\s+\d\d:\d\d:\d\d)')

    temp = list()       # lines of the reply currently being collected
    temp_date = None    # timestamp of that reply
    in_reply = False

    # try/finally: the stream must be closed even if reading or timestamp
    # parsing raises.
    try:
        for line in infile.readlines():
            line = line.rstrip('\n')

            if line.startswith('<time>'):
                if in_reply:
                    print('格式错误:回复文本的前后包括标志不配对。\n',
                          '丢失<mark>行')
                    break

                m = pattern.search(line)
                if not m:
                    print('无法解析日期')
                    break
                temp_date = datetime.strptime(m.group(1),
                                              '%Y-%m-%d %H:%M:%S')
                in_reply = True

            elif line.startswith('<mark>'):
                if not in_reply:
                    print('格式错误:回复文本的前后包括标志不配对。\n',
                          '丢失<time>行')
                    break

                select = line.endswith('█')

                # Add the finished reply.
                rpl = BPReply(temp_date, '\n'.join(temp), select)
                all_list.append(rpl)
                temp.clear()
                in_reply = False

            elif in_reply:
                temp.append(line)

            else:
                # Outside any reply: keep the raw line.
                all_list.append(line)
    finally:
        infile.close()

    if in_reply:
        print('格式错误:最后一个回复文本的前后包括标志不配对。')

    return all_list
def lianzai_fliter(processor, regex, flags):
    """Serial filter: keep only suggested replies that match ``regex``.

    Note: only ``suggest`` is examined and changed; ``select`` is not
    considered.  A suggested reply stays suggested when
    ``processor.has_quote`` is false for it and ``regex`` (compiled with
    ``flags``) is found in its text; otherwise its ``suggest`` flag is
    cleared.  The number of replies kept is printed.
    """
    print('>连载过滤器\n...正则式:{0}'.format(regex))

    serial_re = red.re_dict(regex, flags)

    count = 0
    for reply in processor.rlist:
        if not reply.suggest:
            continue

        if processor.has_quote(reply) or not serial_re.search(reply.text):
            reply.suggest = False
        else:
            count += 1

    print('...选择了{0}条回复作为连载'.format(count))
def mark_empty(self):
    """Clear ``suggest`` on every empty or whitespace-only reply.

    The number of such replies in ``self.rlist`` is printed.
    """
    print('>标记空白回复:')

    blank_re = red.re_dict(r'^\s*$')

    blank_count = 0
    for reply in self.rlist:
        if not blank_re.match(reply.text):
            continue
        reply.suggest = False
        blank_count += 1

    tint = color.Fore.RED if blank_count else color.Fore.GREEN
    color_p = color.fore_color(blank_count, tint)
    print('...标记了{0}个空白回复'.format(color_p))
def do_re_list(self):
    """Apply every rule in ``self.re_list`` to every reply in ``self.rlist``.

    Each rule is indexed as ``rule[0]`` = pattern string or sequence of
    pattern fragments (joined with ``''``), ``rule[1]`` = regex flags and
    ``rule[2]`` = replacement.  Rules are applied in list order to each
    reply's text.  The printed count is the number of (reply, rule) pairs for
    which at least one substitution happened.

    The compiled patterns are kept in a local list.  Appending them to the
    entries of ``self.re_list`` would mutate shared rule data: every call
    would grow each entry, and from the second call on the pattern at index 3
    would be the one compiled during the first call, even if the rule had
    been edited since.
    """
    print('>用正则式列表替换')

    # Compile once, outside the per-reply loop.
    compiled = [(red.re_dict(''.join(rule[0]), rule[1]), rule[2])
                for rule in self.re_list]

    process_count = 0
    for rpl in self.rlist:
        for regex, repl in compiled:
            rpl.text, n = regex.subn(repl, rpl.text)
            if n > 0:
                process_count += 1

    print('...做了{0}次替换'.format(process_count))
def process_1(self):
    """Custom processing step: rewrite raw quotes into the marked-up layout.

    The text of each reply in ``self.rlist`` is passed through a single
    regular expression.  Where it matches, the text becomes::

        回复 <name>:
        【引用开始】<quoted text>
        【引用结束】
        <reply text>

    ``<name>`` is the 1-16 non-space characters between ``@@`` and ``##``;
    the quoted and reply texts are the parts before and after the
    ``re_separater`` line.  The total number of substitutions is printed.
    """
    # Handle quotes.
    print('>处理引用')

    # ``(?=(X))\1`` emulates an atomic group: once X has matched, the engine
    # cannot backtrack into it.
    fragments = (
        r'^(?=(.*@@(\S{1,16})##))', r'\1',
        r'.*?',
        r'(?<=\n)',
        r'(?=(.*?(?<=\n)', re_separater, r'\s+))',
        r'(?!\3.*?(?<=\n)', re_separater, r'\s+)',
        r'\s*(.*?)\s*', re_separater, r'\s+(.*)',
    )
    quote_re = red.re_dict(''.join(fragments), red.DOTALL)
    replacement = r'回复 \2:\n【引用开始】\4\n【引用结束】\n\5'

    quote_count = 0
    for reply in self.rlist:
        new_text, replaced = quote_re.subn(replacement, reply.text)
        reply.text = new_text
        quote_count += replaced

    color_p = color.fore_color(quote_count, color.Fore.CYAN)
    print('...处理了{0}条引用'.format(color_p))
def bp_to_final(infile, keep_discard=True, label=0):
    '''Compile arrangement text into the final text and the discarded text.

    The input is read line by line.  A reply starts at a line beginning with
    ``<time>`` and ends at the next line beginning with ``<mark>``.  A reply
    whose ``<mark>`` line ends with ``█`` is "picked" and goes to the final
    text; any other reply that has at least one non-blank line goes to the
    discarded text.  Lines starting with ``<tiezi>`` that appear before any
    reply text has been collected are stored (without the tag) as info
    lines.

    Args:
        infile: text stream providing ``readlines()`` and ``close()``; it is
            closed after reading.
        keep_discard: when true and something was discarded, a stream with
            the discarded text is returned as well.
        label: 0 adds no labels; 1 inserts a "entering page N" label before
            the first picked reply of each page; 2 prefixes every picked
            reply with a floor label (running number, post time, page).

    Returns:
        A tuple ``(output, discard, info_list, chinese_ct)``: ``output`` is a
        ``StringIO`` with the final text, ``discard`` a ``StringIO`` with the
        discarded text or None, ``info_list`` the collected info lines and
        ``chinese_ct`` the result of ``count_chinese`` on the final text.
    '''
    class placeholder:
        # Records where in text_list a page label may later be inserted.
        def __init__(self, posi=0, pagenum=0, show=False):
            self.posi = posi        # index of the reserved slot in text_list
            self.pagenum = pagenum  # page number to show in the label
            self.show = show        # True once a reply on this page is picked

    def is_not_empty(lst):
        # Yields, for each line, whether it contains non-whitespace text.
        for i in lst:
            yield i.strip() != ''

    info_list = list()
    # Starts with a dummy holder so holder_list[-1] always exists; the dummy
    # is skipped when page labels are rendered (holder_list[1:]).
    holder_list = [placeholder()]
    text_list = list()
    abandon_list = list()

    pickcount, allcount = 0, 0

    # Matches an [img]...[/img] tag, optionally with digits after "img";
    # inside replies it is replaced by 【一张图片N】 (N = those digits, if any).
    picr = (r'\[img\s*(\d+|)\].*?\[/img\]')
    pattern = red.re_dict(picr)

    # Extracts the page number.
    re_pagenum = red.re_dict(r'^<page>页号:\s*(\d+)\s*$')

    # Extracts the time: group 1 is the date without its first two digits,
    # group 2 is hours:minutes.
    p_time = (r'^<time>[^<]*<\d\d(\d\d-\d{1,2}-\d{1,2})\s+'
              r'(\d{1,2}:\d{1,2})')
    re_time = red.re_dict(p_time)

    # Read the arrangement text.
    in_reply = False
    temp = list()       # lines of the reply currently being collected
    current_page = 0
    current_time = ''

    for line in infile.readlines():
        if line.startswith('<time>'):
            if in_reply == True:
                print('格式错误:回复文本的前后包括标志不配对。\n',
                      '丢失<mark>行')
                break
            in_reply = True

            # current_time is only needed for floor labels (label == 2).
            if label == 2:
                m = re_time.search(line)
                if m:
                    current_time = m.group(1) + ' ' + m.group(2)
                else:
                    current_time = ''

        elif line.startswith('<mark>'):
            if in_reply == False:
                print('格式错误:回复文本的前后包括标志不配对。\n',
                      '丢失<time>行')
                break

            if line.endswith('█\n') or line.endswith('█'):
                # Picked reply.
                pickcount += 1

                if label == 0:
                    pass
                elif label == 1:
                    # The current page now has a picked reply, so its page
                    # label will be rendered.
                    holder_list[-1].show = True
                elif label == 2:
                    floor_label = ('№.%d ☆☆☆'
                                   ' 发表于%s P.%d '
                                   '☆☆☆\n'
                                   '-------------------------'
                                   '-------------------------'
                                   '\n')
                    floor_label = floor_label % \
                                  (pickcount, current_time, current_page)
                    text_list.append(floor_label)

                text_list.extend(temp)
                text_list.append('\n')
            elif any(is_not_empty(temp)):
                # Not picked, but not blank either: keep it as discarded text.
                abandon_list.extend(temp)
                abandon_list.append('∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞∞\n\n')

            temp.clear()
            allcount += 1
            in_reply = False

        elif in_reply:
            line = pattern.sub(r'【一张图片\1】', line)
            temp.append(line)

        # Because of the previous elif, in_reply is necessarily False below.
        elif not text_list and not abandon_list and \
             line.startswith('<tiezi>'):
            info_list.append(line[len('<tiezi>'):])

        elif label != 0:
            m = re_pagenum.search(line)
            if m:
                current_page = int(m.group(1))
                if label == 1:
                    # Reserve an empty slot that may become a page label.
                    text_list.append('')
                    holder = placeholder(len(text_list)-1,
                                         current_page
                                         )
                    holder_list.append(holder)

    infile.close()

    if in_reply == True:
        print('格式错误:最后一个回复文本的前后包括标志不配对。')

    # Page labels: fill the reserved slots of pages that have a picked reply.
    if label == 1:
        for holder in holder_list[1:]:
            if holder.show:
                page_label = ('☆☆☆☆☆'
                              ' 进入第%d页 '
                              '☆☆☆☆☆\n'
                              '----------------'
                              '----------------'
                              '\n\n') % holder.pagenum
                text_list[holder.posi] = page_label

    color_p1 = color.fore_color(allcount, color.Fore.YELLOW)
    color_p2 = color.fore_color(pickcount, color.Fore.YELLOW)
    print('共有{0}条回复,选择了其中{1}条回复'.format(color_p1, color_p2))

    # Build the output ============
    # Join the pieces; info lines, if any, come first followed by a newline.
    if info_list:
        s_iter = itertools.chain(info_list, '\n', text_list)
    else:
        s_iter = iter(text_list)
    s = ''.join(s_iter)

    # Collapse runs of consecutive picture placeholders: three or more first,
    # then the remaining pairs.
    s = red.sub(r'(?:【一张图片(\d+|)】\s+){3,}', r'【多张图片\1】\n\n', s)
    s = red.sub(r'(?:【一张图片(\d+|)】\s+){2}', r'【两张图片\1】\n\n', s)

    # Output stream.
    output = StringIO(s)

    # Chinese character count (as computed by count_chinese).
    chinese_ct = count_chinese(s)

    # Discarded text.
    if keep_discard and abandon_list:
        s_iter = itertools.chain(info_list, '\n', abandon_list)
        s = ''.join(s_iter)
        discard = StringIO(s)
    else:
        discard = None

    return output, discard, info_list, chinese_ct
def statistic(self):
    """Print summary statistics about the replies in ``self.rlist``.

    Printed are: the total and selected reply counts, then - for replies that
    are not blank and do not end with the "duplicate" / "cannot be handled"
    notes - the maximum, mean, median and population standard deviation of
    the lengths reported by ``self.reply_len_quote``, followed by a histogram
    of those lengths.  Lengths are reported in three columns: quote part and
    reply part of quoting replies, and whole length of non-quoting replies.
    """
    # Totals --------------------------
    print('回复总数:', len(self.rlist))

    selected_count = len([r for r in self.rlist if r.select])
    print('选择的回复数:', selected_count)
    print()

    # Length statistics --------------------------
    print('以下的统计不包括空白、重复和无法处理的回复:\n')

    blank_re = red.re_dict(r'^\s*$')
    skip_suffixes = ('【与上一条回复重复】', '【无法处理的回复】')

    def should_pick(reply):
        """Tell whether ``reply`` takes part in the statistics."""
        if blank_re.match(reply.text):
            return False
        return not reply.text.endswith(skip_suffixes)

    # reply_len_quote returns an indexable triple; index 0 is -1 for a reply
    # without a quote, in which case index 2 holds its length.
    qlenlist = []     # quote-part lengths of quoting replies
    rlenlist = []     # reply-part lengths of quoting replies
    noqlenlist = []   # lengths of non-quoting replies
    for reply in self.rlist:
        if not should_pick(reply):
            continue
        lengths = self.reply_len_quote(reply)
        if lengths[0] != -1:
            qlenlist.append(lengths[0])
            rlenlist.append(lengths[1])
        else:
            noqlenlist.append(lengths[2])

    columns = (qlenlist, rlenlist, noqlenlist)

    def num(lst, func):
        """Apply ``func`` to ``lst``; an empty list gives 0."""
        return func(lst) if lst else 0

    print(' (引用部分 回复部分) 无引用回复')
    print(' 总 数 : {0:<8} + {1:<8} = {2}'.format(
        len(qlenlist), len(noqlenlist), len(qlenlist) + len(noqlenlist)))

    summary_rows = (
        ('最长的字数: {0:<8} {1:<8} {2:<8}', max),
        ('字数平均数: {0:<8.2f} {1:<8.2f} {2:<8.2f}', statistics.mean),
        ('字数中位数: {0:<8.0f} {1:<8.0f} {2:<8.0f}', statistics.median),
        ('总体标准差: {0:<8.2f} {1:<8.2f} {2:<8.2f}', statistics.pstdev),
    )
    for template, func in summary_rows:
        print(template.format(*[num(col, func) for col in columns]))

    # Length distribution ------------------------------
    # Bucket bounds: 0 followed by the rounded values of e**x for
    # x = 2.0, 2.5, ..., 9.5.
    e_table = [0, 7, 12, 20, 33, 55, 90, 148, 245, 403,
               665, 1097, 1808, 2981, 4915, 8103, 13360]

    def get_len_distribution(lenlist):
        """Count lengths per bucket.

        Slot i counts lengths below e_table[i] but not below e_table[i-1];
        the extra last slot counts lengths at or above the final bound.
        """
        table_len = len(e_table)
        count_table = [0] * (table_len + 1)
        for length in lenlist:
            slot = next(
                (i for i, bound in enumerate(e_table) if length < bound),
                table_len)
            count_table[slot] += 1
        return count_table

    qdis, rdis, ndis = [get_len_distribution(col) for col in columns]

    # Print the histogram.  Slot 0 (lengths below 0) is never shown.
    print('\n字数分布')
    print(' ' * 16, '(引用部分 回复部分) 无引用回复')
    for i, (low, high) in enumerate(zip(e_table, e_table[1:]), start=1):
        print('{0:>6}<= x <{1:<5} : {2:<8} {3:<8} {4:<8}'.format(
            low, high, qdis[i], rdis[i], ndis[i]))
    print('{0:>6}<= x : {1:<8} {2:<8} {3:<8}'.format(
        e_table[-1], qdis[-1], rdis[-1], ndis[-1]))

    print(' ' * 8, '=' * 35)
    print(' ' * 12,
          '总数 : {0:<8} {1:<8} {2:<8}'.format(
              len(qlenlist), len(rlenlist), len(noqlenlist)))
def statistic(self):
    """Print reply counts, length statistics and a length histogram.

    The first part prints how many replies ``self.rlist`` holds and how many
    have ``select`` set.  The rest only considers replies that are not blank
    and do not end with the 【与上一条回复重复】 or 【无法处理的回复】 note;
    their lengths come from ``self.reply_len_quote`` and are reported in
    three columns: quote part / reply part of quoting replies, and length of
    non-quoting replies.
    """
    all_replies = self.rlist

    # Totals --------------------------
    print('回复总数:', len(all_replies))
    selected_count = sum(1 for item in all_replies if item.select)
    print('选择的回复数:', selected_count)
    print()

    # Length statistics --------------------------
    print('以下的统计不包括空白、重复和无法处理的回复:\n')

    p_space = red.re_dict(r'^\s*$')

    def should_pick(reply):
        """Return True for replies that take part in the statistics."""
        text_is_blank = p_space.match(reply.text)
        if text_is_blank:
            return False
        for note in ('【与上一条回复重复】', '【无法处理的回复】'):
            if reply.text.endswith(note):
                return False
        return True

    measured = [self.reply_len_quote(item)
                for item in all_replies if should_pick(item)]

    # Index 0 of a measurement is -1 when the reply has no quote.
    with_quote = [m for m in measured if m[0] != -1]
    qlenlist = [m[0] for m in with_quote]      # quote-part lengths
    rlenlist = [m[1] for m in with_quote]      # reply-part lengths
    noqlenlist = [m[2] for m in measured if m[0] == -1]
    del measured, with_quote

    def num(lst, func):
        """Return ``func(lst)``, or 0 for an empty list."""
        if lst:
            return func(lst)
        return 0

    def three(func):
        """Evaluate ``func`` on the three columns, left to right."""
        return (num(qlenlist, func),
                num(rlenlist, func),
                num(noqlenlist, func))

    print(' (引用部分 回复部分) 无引用回复')

    n_quote, n_plain = len(qlenlist), len(noqlenlist)
    print(' 总 数 : {0:<8} + {1:<8} = {2}'.format(
        n_quote, n_plain, n_quote + n_plain))

    print('最长的字数: {0:<8} {1:<8} {2:<8}'.format(*three(max)))
    print('字数平均数: {0:<8.2f} {1:<8.2f} {2:<8.2f}'.format(
        *three(statistics.mean)))
    print('字数中位数: {0:<8.0f} {1:<8.0f} {2:<8.0f}'.format(
        *three(statistics.median)))
    print('总体标准差: {0:<8.2f} {1:<8.2f} {2:<8.2f}'.format(
        *three(statistics.pstdev)))

    # Length distribution ------------------------------
    # Bucket bounds: 0 followed by the rounded values of e**x for
    # x = 2.0, 2.5, ..., 9.5.
    e_table = [0, 7, 12, 20, 33, 55, 90, 148, 245, 403,
               665, 1097, 1808, 2981, 4915, 8103, 13360]

    def get_len_distribution(lenlist):
        """Count how many lengths fall into each bucket of ``e_table``.

        The returned list has one slot per bound (lengths below that bound
        and not below the previous one) plus a final slot for lengths at or
        above the last bound.
        """
        overflow_slot = len(e_table)
        count_table = [0] * (overflow_slot + 1)
        for length in lenlist:
            slot = overflow_slot
            for i, bound in enumerate(e_table):
                if length < bound:
                    slot = i
                    break
            count_table[slot] += 1
        return count_table

    qdis = get_len_distribution(qlenlist)
    rdis = get_len_distribution(rlenlist)
    ndis = get_len_distribution(noqlenlist)

    # Print the histogram; slot 0 (lengths below 0) has no row.
    print('\n字数分布')
    print(' ' * 16, '(引用部分 回复部分) 无引用回复')

    row = '{0:>6}<= x <{1:<5} : {2:<8} {3:<8} {4:<8}'
    i = 1
    while i < len(e_table):
        print(row.format(e_table[i - 1], e_table[i],
                         qdis[i], rdis[i], ndis[i]))
        i += 1

    print('{0:>6}<= x : {1:<8} {2:<8} {3:<8}'.format(
        e_table[-1], qdis[-1], rdis[-1], ndis[-1]))

    print(' ' * 8, '=' * 35)
    totals = '总数 : {0:<8} {1:<8} {2:<8}'.format(
        len(qlenlist), len(rlenlist), len(noqlenlist))
    print(' ' * 12, totals)