def pick_mobile(root_src, root_dest, name_exception_file):
    """Split phone-number lines from every text file under *root_src*.

    Lines that are exactly 11 characters long are treated as well-formed
    mobile numbers and written to a file with the same relative name under
    *root_dest*; longer lines (which may merely contain a number) are pooled
    across all files and written once to a single exception file.

    :param root_src: input folder path
    :param root_dest: output folder path
    :param name_exception_file: file name for the pooled suspect lines
    """
    path_exception_file = root_dest + '\\' + name_exception_file
    if not os.path.isdir(root_dest):
        os.makedirs(root_dest)
    suspect_lines = set()  # lines too long to be a bare mobile number
    for src_path in files(root_src):
        well_formed = set()  # clean 11-char numbers from this file only
        with open(src_path, 'r', encoding='utf-8') as reader:
            for raw in reader:
                candidate = raw.strip()
                # Earlier pipeline stages normalised the data, so length
                # alone is enough to tell clean numbers from suspect lines
                # (may need adjusting if the upstream format changes).
                if len(candidate) == 11:
                    well_formed.add(candidate)
                elif len(candidate) > 11:
                    suspect_lines.add(candidate)
        # Mirror the source file's relative name below root_dest.
        dest_path = root_dest + '\\' + src_path[len(root_src):]
        logger.info("Writing normal mobile to file: {}.".format(dest_path))
        write_list_to_file(dest_path, well_formed)
    logger.info(
        "Writing exceptional mobile to file: {}.".format(path_exception_file))
    write_list_to_file(path_exception_file, suspect_lines)
def process_file(self):
    """Run the full processing pipeline for this processor's file.

    Reads the raw data, splits multi-target records, delegates extraction
    to the bridge-pattern implementation object, then copies its results
    back onto this processor.
    """
    logger.info("开始文件,路径:%s,类型:%s" % (self.file, self.name))
    self.read_data()           # load the raw file content
    self.split_multi_target()  # break combined records apart first
    impl = self._process_impl
    # Extraction itself lives in the bridge implementation class.
    impl.process_data(self._src_data)
    self._results = impl.results  # pull the processed results back
    logger.info("完成文件,路径:%s,结果 %s %d个"
                % (self.file, impl.name, len(self.results)))
def cnt_mobile_content(self, chat):
    """Count mobile numbers inside the file referenced by *chat*'s content.

    :param chat: chat record whose content holds a Windows file path
    :return: ``(count, paths)`` — ``(None, "")`` when the chat is not a
             target attachment type, ``(-1, "")`` when the file cannot be
             located, otherwise the count plus the ';'-joined candidate
             paths that were searched.
    """
    if not self.is_target_chat_ext(chat):
        return None, ""
    # The content is a Windows path; keep only the trailing file name.
    base_name = chat.content.split('\\')[-1]
    candidates = search_file(base_name, self.root_search)
    joined = ';'.join(candidates)
    if candidates:
        return self.cnt_from_multi_path(candidates), joined
    logger.info("Failed locating file: {}.".format(base_name))
    return -1, joined
def wrapper_count_mobile(self, file):
    """Count unique mobile numbers in *file*.

    On failure or a zero count the file is copied into
    ``self.root_exception`` for manual inspection.

    :param file: path of the file to scan
    :return: the positive count, or ``None`` when counting raised or
             found nothing
    """
    try:
        num = len(count_unique_mobile(file))
    except Exception:
        # A counting failure is handled the same way as "no numbers found".
        num = 0
    if num > 0:
        return num
    # Zero hits or an error: park a copy of the file for manual review.
    # (Originally modelled with a bare `raise Exception` as control flow;
    # a plain conditional expresses the same outcomes directly.)
    try:
        file_name = file.split('\\')[-1]
        target_path = self.root_exception + '\\' + file_name
        shutil.copy(file, target_path)
        logger.info("Copy file to exception: {}.".format(target_path))
    except Exception:
        traceback.print_exc()  # best-effort only: the copy itself failed
    return None
def handle_process_err(self, processor, err):
    """Route a failed file into the matching quarantine folder.

    Non-Word files are cut to a folder chosen from the error message
    (encrypted / codec / no-results / generic exception). Word files are
    always cut to ``root_exception``, and a leftover ``.docx`` twin of a
    ``.doc`` (from a partial conversion) is deleted.

    :param processor: the file processor that failed (provides ``.file``)
    :param err: the exception that was raised
    """
    logger.exception(str(err))
    file = processor.file
    message = str(err)
    # Case 1: not a Word document — pick the destination from the message.
    if not is_doc_file(file):
        pathCutTo = self.root_exception
        if "encrypted" in message:
            pathCutTo = self.root_encrypted
        elif "codec" in message:
            pathCutTo = self.root_codec
        elif "Found no" in message:
            pathCutTo = self.root_no_results
        self.cut_file_with_path(file, pathCutTo)
    # Case 2: a Word document.
    else:
        try:
            self.cut_file_with_path(file, self.root_exception)
            if ext_in(file, ['doc']):
                # Remove the .docx produced by a doc->docx conversion so it
                # is not processed again on its own.
                docxPath = change_ext_to(file, 'docx')
                if os.path.isfile(docxPath):
                    logger.info("Remove file: {}".format(docxPath))
                    os.remove(docxPath)
        # Cutting can fail while Word still holds the .doc open, and even
        # logging that failure has been seen to raise again — hence the
        # nesting. Narrowed from bare `except:` so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        except Exception:
            try:
                logger.exception(str(err))
            except Exception:
                print("Error raised in traceback.")
def merge_text(name, root_src, root_dest="", start_id=1, len_result_file=1000000):
    """Merge all text files under *root_src* into numbered result files.

    Lines are buffered and flushed to a new file whenever more than
    *len_result_file* lines are pending, producing files named
    ``name<start_id>.txt``, ``name<start_id+1>.txt``, ... with any
    remainder written at the end.

    :param name: name stem of the merged result files
    :param root_src: folder holding the source text files
    :param root_dest: folder for the result files; when empty, a sibling
        of root_src named ``mergeResult_<name>`` is created
    :param start_id: numeric suffix of the first result file
    :param len_result_file: maximum number of lines per result file
    """
    if not root_dest:
        parent = os.path.split(root_src)[0]
        root_dest = parent + '\\mergeResult_' + name
    if os.path.isdir(root_dest):
        logger.warning("root_dest已存在,同名合并结果文件将被覆盖")
    else:
        os.makedirs(root_dest)
    pending = []
    for src in files(root_src):
        logger.info("Reading file: {}.".format(src))
        # Read as bytes: source encodings vary, try_decode handles them.
        with open(src, 'rb') as raw:
            pending.extend(try_decode(chunk).strip() for chunk in raw)
        # Flush only strictly-overfull buffers so every flushed file holds
        # exactly len_result_file lines; the remainder goes out at the end.
        while len(pending) > len_result_file:
            head, pending = pending[:len_result_file], pending[len_result_file:]
            out_path = root_dest + '\\' + name + str(start_id) + '.txt'
            logger.info("Writing file: {}.".format(out_path))
            write_list_to_file(out_path, head)
            start_id += 1
    out_path = root_dest + '\\' + name + str(start_id) + '.txt'
    logger.info("Writing file: {}.".format(out_path))
    write_list_to_file(out_path, pending)
def process_files(self):
    """Main entry point: drives the whole run.

    Optionally converts .doc files to .docx up front (interactively
    confirmed), deletes the converted originals, then processes every file
    under ``self.root_src`` on a multiprocessing pool.
    """
    start_time = time()
    # doc->docx conversion is special and cannot run under multiprocessing,
    # so it is pulled out here ahead of the pool: convert, then delete the
    # successfully converted .doc originals, then let the pool below handle
    # only the .docx files.
    if docImported:
        print("准备将doc文件转换为docx文件\n注意:转换期间不要用word打开任何文档,否则转换会失败或终止")
        if input("输入'y'继续,输入'n'跳过doc转换步骤").lower() == 'y':
            print("开始转换")
            logger.info("Start transferring doc to docx.")
            for file in files(self.root_src):
                self.doc_to_docx(file)
            print("准备删除转换成功的doc文件\n注意:继续前请手动备份")
            input("按任意键继续")
            print("开始删除")
            logger.info("Start deleting transferred doc file.")
            for file in files(self.root_src):
                self.del_transferred_doc(file)
            print("准备开始正式的处理过程\n"
                  "注意:确保将源文件夹({})内剩余doc文件(应该是转换失败或转换成功后删除失败的)手动处理后再继续"
                  .format(self.root_src))
            input("按任意键继续")
    else:
        # win32com is unavailable: doc/docx files will be cut to the
        # exception folder instead of being processed.
        print("win32com无法导入,忽视doc和docx的处理,会剪切到{}".format(self.root_exception))
    # Use every core, or half of them, depending on the full_core flag.
    cores = multiprocessing.cpu_count() if self.full_core else int(multiprocessing.cpu_count() / 2)
    pool = multiprocessing.Pool(processes=cores)
    pool.imap(self.process_file, files(self.root_src))
    pool.close()
    pool.join()
    logger.info("Done. Time spent: {}.".format(time() - start_time))