def launchOnlyHashingByID(self, sample):
    """Run only the hash processor on ``sample`` and store what it produced.

    The sample is first wired to this object's package, metadata and version
    controllers. Its category is taken from ``sample.getCategory()``; when
    that returns ``None`` the category is computed from the sample binary
    with ``Cataloger``. The hash processor obtained from ``ProcessorFactory``
    is then run, and if it reports a non-empty version the resulting
    metadata and version are written through ``self.mdc`` and ``self.vc``.

    :param sample: sample object to analyse (must provide the controller
        setters plus ``getCategory``, ``getBinary`` and ``getID``).
    :return: ``0`` on success (including when there was nothing to update),
        ``-1`` when writing the metadata failed.
    """
    sample.setPackageController(self.pc)
    sample.setMetaController(self.mdc)
    sample.setVersionController(self.vc)

    category = sample.getCategory()
    # Identity test: ``None`` is a singleton, and ``==`` could be intercepted
    # by a category type that overrides ``__eq__``.
    if category is None:
        category = Cataloger().catalog(sample.getBinary())
        logging.debug("Category not found in DB, categorized as %s", str(category))
    else:
        logging.debug("Category found in DB, categorized as %s", str(category))

    processor = ProcessorFactory().getHashProcessor(category, sample)
    result_dic = processor.process()
    result_version = processor.getVersion()

    if len(result_version) > 0:
        logging.debug("Updating metadata")
        # A non-zero return from the metadata controller is treated as failure;
        # the version is deliberately not updated in that case.
        if self.mdc.write(sample.getID(), result_dic) != 0:
            logging.error("Error writing Metadata to DB, sample:%s", sample.getID())
            return -1
        logging.debug("Metadata writed in DB")
        self.vc.updateVersion(sample.getID(), result_version)
        logging.debug("Versions writed to DB")
    else:
        logging.debug("Nothing to update")

    logging.debug("Analysis Finished OK")
    return 0
def launchOnlyHashingByID(self, sample):
    """Hash-only analysis of ``sample``; persists metadata and version.

    Attaches this object's controllers to the sample, resolves the sample's
    category (falling back to ``Cataloger`` when the sample has none), runs
    the matching hash processor and, when the processor reports a non-empty
    version, writes the metadata and the version.

    :param sample: the sample to process.
    :return: ``0`` when the analysis finished (with or without updates),
        ``-1`` when the metadata write reported an error.
    """
    sample.setPackageController(self.pc)
    sample.setMetaController(self.mdc)
    sample.setVersionController(self.vc)

    category = sample.getCategory()
    if category is not None:
        logging.debug(
            "Category found in DB, categorized as %s", str(category))
    else:
        # No stored category: derive one from the raw binary.
        category = Cataloger().catalog(sample.getBinary())
        logging.debug(
            "Category not found in DB, categorized as %s", str(category))

    hasher = ProcessorFactory().getHashProcessor(category, sample)
    metadata = hasher.process()
    versions = hasher.getVersion()

    # Guard clause: nothing was produced, so there is nothing to store.
    if not len(versions) > 0:
        logging.debug("Nothing to update")
        logging.debug("Analysis Finished OK")
        return 0

    logging.debug("Updating metadata")
    write_status = self.mdc.write(sample.getID(), metadata)
    if write_status != 0:
        logging.error(
            "Error writing Metadata to DB, sample:%s", sample.getID())
        return -1
    logging.debug("Metadata writed in DB")

    self.vc.updateVersion(sample.getID(), versions)
    logging.debug("Versions writed to DB")

    logging.debug("Analysis Finished OK")
    return 0
def __init__(self, root_src, pro_type, full_core=False):
    """
    Store the run configuration and derive the sibling result directories.

    :param root_src: root directory of the files to process
    :param pro_type: kind of processing (e.g. mobile number, ID card; mobile
        number by default). An integer, one of the ProcessorFactory class
        attributes
    :param full_core: whether to use all CPU cores
    """
    self.root_src = root_src
    self.pro_type = pro_type
    self.full_core = full_core

    # Every result directory lives next to root_src, under its parent.
    parent_dir = os.path.dirname(root_src)
    self.root_output = os.path.join(parent_dir, 'output')
    self.root_complete = os.path.join(parent_dir, 'complete')
    self.root_exception = os.path.join(parent_dir, 'exception')
    self.root_encrypted = os.path.join(parent_dir, 'exception_encrypted')
    self.root_codec = os.path.join(parent_dir, 'exception_codex')
    self.root_no_results = os.path.join(parent_dir, 'exception_noResult')

    self.process_factory = ProcessorFactory()
class MobileCollector:
    """
    Entry-point class of the mobile-number extraction script.

    Other processing methods are plugged in through a bridge-style interface,
    so implementations that extract ID-card numbers or other information can
    be added.
    Classes in Processors/AbstractProcessors read data from the various file
    types into memory.
    Classes in Processors/ProcessorImpls do the actual processing of the data
    that was read, e.g. extracting mobile numbers.
    """

    def __init__(self, root_src, pro_type, full_core=False):
        """
        :param root_src: root directory of the files to process
        :param pro_type: kind of processing (e.g. mobile number, ID card;
            mobile number by default). An integer, one of the
            ProcessorFactory class attributes
        :param full_core: whether to use all CPU cores
        """
        self.root_src = root_src
        self.pro_type = pro_type
        self.full_core = full_core
        # All result directories are created next to root_src (in its parent).
        p1 = os.path.split(root_src)[0]
        self.root_output = os.path.join(p1, 'output')
        self.root_complete = os.path.join(p1, 'complete')
        self.root_exception = os.path.join(p1, 'exception')
        self.root_encrypted = os.path.join(p1, 'exception_encrypted')
        self.root_codec = os.path.join(p1, 'exception_codex')
        self.root_no_results = os.path.join(p1, 'exception_noResult')
        self.process_factory = ProcessorFactory()

    @staticmethod
    def _pool_size(full_core, cpu_total):
        """Return the worker count: every core, or half of them, never below 1."""
        # Half of a single core would be 0, which multiprocessing.Pool rejects.
        return cpu_total if full_core else max(1, cpu_total // 2)

    def process_files(self):
        """Main method: optionally converts doc files, then fans out to worker processes."""
        start_time = time()

        # doc -> docx conversion is a special case that cannot be done in
        # multiple processes, so it runs first, on its own: convert to docx,
        # delete the original doc files, then let the worker pool below
        # process the docx files.
        if docImported:
            print("准备将doc文件转换为docx文件\n注意:转换期间不要用word打开任何文档,否则转换会失败或终止")
            if input("输入'y'继续,输入'n'跳过doc转换步骤").lower() == 'y':
                print("开始转换")
                logger.info("Start transferring doc to docx.")
                for file in files(self.root_src):
                    self.doc_to_docx(file)

                print("准备删除转换成功的doc文件\n注意:继续前请手动备份")
                input("按任意键继续")
                print("开始删除")
                logger.info("Start deleting transferred doc file.")
                for file in files(self.root_src):
                    self.del_transferred_doc(file)

                print("准备开始正式的处理过程\n"
                      "注意:确保将源文件夹({})内剩余doc文件(应该是转换失败或转换成功后删除失败的)手动处理后再继续"
                      .format(self.root_src))
                input("按任意键继续")
        else:
            print("win32com无法导入,忽视doc和docx的处理,会剪切到{}".format(self.root_exception))

        # Use all cores or half of them (at least one worker).
        cores = self._pool_size(self.full_core, multiprocessing.cpu_count())
        pool = multiprocessing.Pool(processes=cores)
        # The imap results are intentionally not consumed: process_file
        # reports its own outcome by moving the file and logging.
        pool.imap(self.process_file, files(self.root_src))
        pool.close()
        pool.join()
        logger.info("Done. Time spent: {}.".format(time() - start_time))

    def process_file(self, file):
        """Get a processor from the factory, run it on ``file`` and dispatch to the error/success handler."""
        p = self.process_factory.create_processor(file, self.pro_type)
        try:
            p.process_file()
            path_opt = self.make_path_opt(file)
            write_list_to_file(path_opt, p.results)
        except Exception as e:
            self.handle_process_err(p, e)
        else:
            self.handle_process_suc(p)

    def handle_process_err(self, processor, err):
        """
        Log ``err`` and move the processor's file into the matching exception directory.

        :param processor: the processor that failed; its ``file`` attribute is the source path
        :param err: the exception raised while processing
        """
        logger.exception(str(err))
        file = processor.file
        if not is_doc_file(file):
            # Failure on a non-Word file: choose the target directory from
            # keywords found in the error message.
            pathCutTo = self.root_exception
            if "encrypted" in str(err):
                pathCutTo = self.root_encrypted
            elif "codec" in str(err):
                pathCutTo = self.root_codec
            elif "Found no" in str(err):
                pathCutTo = self.root_no_results
            self.cut_file_with_path(file, pathCutTo)
        else:
            # Failure on a Word file.
            try:
                self.cut_file_with_path(file, self.root_exception)
                if ext_in(file, ['doc']):
                    # Drop the docx that was generated from this doc, if any.
                    docxPath = change_ext_to(file, 'docx')
                    if os.path.isfile(docxPath):
                        logger.info("Remove file: {}".format(docxPath))
                        os.remove(docxPath)
            # This can be triggered by moving a doc file that Word has not
            # released yet; logging it may itself fail for unknown reasons.
            # Only Exception is caught so Ctrl-C / SystemExit still propagate.
            except Exception:
                try:
                    logger.exception(str(err))
                except Exception:
                    print("Error raised in traceback.")

    def handle_process_suc(self, processor):
        """Move a successfully processed file into the ``complete`` directory."""
        file = processor.file
        self.cut_file_with_path(file, self.root_complete)

    def cut_file_with_path(self, src, dest_root):
        """
        Move ``src`` from under root_src to the target root, keeping the directory tree.

        :param src: path of the file to move
        :param dest_root: target root directory
        :return: True when the file was moved; False when the source is
            missing, the destination already exists, or the move hit a
            PermissionError
        """
        if not os.path.isfile(src):
            print("Source file to cut from doesn't exist: {}".format(src))
            return False

        # Re-root the path: strip the root_src prefix and prepend dest_root.
        tail = src[len(self.root_src):]
        dest_path = dest_root + tail
        dest_dir = os.path.split(dest_path)[0]
        # Several worker processes may create the same directory at once;
        # exist_ok avoids the check-then-create race.
        os.makedirs(dest_dir, exist_ok=True)

        if os.path.isfile(dest_path):
            logger.warning("File already exists at destination, fromPath = {}, dest_path = {}.".format(src, dest_path))
            return False
        try:
            shutil.move(src, dest_path)
        except PermissionError:
            logger.critical("PermissionError, cut failed: {}.".format(src))
            return False
        return True

    def make_path_opt(self, file, prefix=""):
        """
        Build the output path for ``file``'s results.

        The leading root_src part of the path is replaced with root_output
        and the extension with ``.txt``, so the output keeps the same
        directory layout as the source. The output directory is created if
        it does not exist.

        :param file: path of the source file
        :param prefix: text prepended to the output file name
        :return: path of the output text file
        """
        part1, ext = os.path.splitext(file)
        pathOptText = part1 + '.txt'
        pathOptText = self.root_output + pathOptText[len(self.root_src):]
        p1, p2 = os.path.split(pathOptText)
        p2 = prefix + p2
        pathOptText = os.path.join(p1, p2)
        # Make sure the directory structure of the output path exists.
        # Several processes may get here at the same time and try to create
        # the same missing directory; exist_ok makes that harmless.
        os.makedirs(p1, exist_ok=True)
        return pathOptText

    def doc_to_docx(self, file):
        """
        If ``file`` has the doc extension and no docx of the same name exists
        yet, try to save it as docx (via win32com) under the same name in the
        same directory.

        :param file: path of the candidate file
        :return: None
        """
        if ext_in(file, ['doc']) and not os.path.isfile(change_ext_to(file, 'docx')):
            try:
                # pro_type must be passed, but since no data is processed its
                # value is irrelevant; the processor is only needed for its
                # save-as-docx method.
                p = self.process_factory.create_processor(file, pro_type=ProcessorFactory.pro_mobile)
                p.doc_to_docx()
            except Exception:
                # Best effort: report and carry on with the next file.
                print("Save doc as docx failed: {}.".format(file))

    def del_transferred_doc(self, file):
        """
        If ``file`` has the doc extension and a docx of the same name exists
        in the same directory, delete the doc file and keep the docx.

        :param file: path of the candidate file
        :return: None
        """
        if ext_in(file, ['doc']) and os.path.isfile(change_ext_to(file, 'docx')):
            try:
                os.remove(file)
            except OSError:
                logger.critical("Remove doc: {} failed.".format(file))