def get(self):
    """Debug endpoint: returns a fixed payload; earlier experiments are kept as comments."""
    if request.method != "GET":
        return "error: method not allowed"
    # mysql_db = PyMysql(HOST, PORT, USER, PASSWORD, DATABASE)
    # print(type(mysql_db))
    #
    # connection, cursor = mysql_db.connectAndGetCursor()
    # current_app.logger.info(type(connection))
    # current_app.logger.info(type(cursor))
    #
    # # Query table data
    # sql = """select CONCAT("alter table ",a.table_name," convert to character set utf8mb4 collate utf8mb4_bin;") from (select table_name from information_schema.`TABLES` where TABLE_SCHEMA = "contract") a;"""
    #
    # res = mysql_db.executeBySelect(connection, cursor, sql)
    # for sql_str in res:
    #     print(sql_str[0])
    #
    # current_app.logger.info(res)
    # res = re.search(r"\d+", "read12343123asd")
    res = {"a": 123}
    # cache.set("test", serialize(HtmlItem("test")))
    # cache.lpush("aaa", serialize(HtmlItem("zxcvbnm")))
    # res = cache.get("test")
    #
    # a = cache.rpop("aaa")
    # logger.info(f"{res, type(res)}")
    # logger.info(f"{a, type(a)}")
    logger.info("test")
    return {"code": 200, "msg": "OK", "data": [res, "a"]}
def uncompress(self, compressed_files_path):
    """
    Recursively decompress the given archive file.

    Args:
        compressed_files_path: path of the archive to decompress
    Returns:
        path of the directory holding the extracted files, or "" on failure
    """
    logger.info(f"start uncompress file: {compressed_files_path}")
    if compressed_files_path and os.path.exists(compressed_files_path):
        # Unpack according to the archive type
        uncompress_dir = self.un_pack(compressed_files_path)
        files_path = None
        if uncompress_dir:
            # Normalize the extracted files into a single directory
            files_path = self.file_processing(uncompress_dir)
            # Walk the extracted files and decompress any nested archives
            if files_path:
                self.recursion_decompressing(files_path)
            # No nested archives remain at this point; gather everything into one place
            files_path = self.file_processing(uncompress_dir)
        # Remove hidden files from the directory
        # self.del_file(files_path)
    else:
        logger.info("file does not exist or the path is invalid, please check!")
        files_path = ""
    return files_path
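# Usage sketch (assumption): the enclosing class and its un_pack /
# recursion_decompressing helpers are defined elsewhere; "FileHandler" and the
# archive path below are illustrative names only.
# handler = FileHandler()
# extracted_dir = handler.uncompress("/tmp/archives/contracts.zip")
# if extracted_dir:
#     print(os.listdir(extracted_dir))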
def word_to_pdf_single(word_path: str, dir_path, delete=True):
    """
    Convert a doc/docx file to PDF on Ubuntu via the LibreOffice command line,
    optionally deleting the source file afterwards.

    Args:
        word_path: path of the doc/docx file
        dir_path: directory where the converted file is written
        delete: whether to delete the source file, defaults to True
    Returns:
    """
    try:
        if word_path and os.path.exists(word_path):
            # soffice returns a zero exit status on success
            status = os.system(
                f"soffice --headless --convert-to pdf {word_path} --outdir {dir_path}"
            )
            if status == 0 and delete:
                try:
                    os.remove(word_path)
                    logger.info(f"rm {word_path}")
                except Exception as e:
                    logger.exception(e)
    except Exception as e:
        logger.exception(e)
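# Usage sketch (assumption): requires LibreOffice (soffice) on the PATH; the
# file and directory paths below are illustrative only.
# word_to_pdf_single("/data/docs/report.docx", "/data/docs", delete=False)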
def download_page(self, item):
    """
    Fetch the page for the next URL and mark its new state based on the current one.
    Type marks: 0 = initial page, 1 = global list page, 2 = extract download links.
    """
    item = MyDict(item)
    try:
        print(f" download_page info: {item.url}")
        logger.info("start downloading page: {}, type: {}".format(item.url, item.type))
        # current_app.logger.info("start downloading page: {}, type: {}".format(item.url, item.type))
        old_type = item.type
        if item.url != "None" and not cache.get(item.url):
            # if item.url != "None":
            html_obj = requests.get(item.url, headers=self.headers)
            html_str = html_obj.content.decode("gbk")
            item.content = html_str
            print(len(html_str))
            # Cache the response in redis with an expiry
            cache.set(item.url, html_str)
            cache.expire(item.url, self.redis_key_timeout)
            # item.xpath_obj = lxml.html.fromstring(html_str)
            logger.info("type before download: {}, type after download: {}".format(old_type, item.type))
            self.push_item_in_redis_list(self.message_b, item)
    except Exception as e:
        logger.error("download_page: {}".format(e))
# Note: only the inner wrapper appeared in the original fragment; the outer
# decorator function and its name "timeit" are assumed here so the wrapper is usable.
def timeit(func):
    """Decorator that logs the wall-clock time taken by the wrapped function."""

    def _wrap(*args, **kwargs):
        st = time.time()
        rst = func(*args, **kwargs)
        et = time.time()
        output_str = "func: '{}' time: {}s".format(func.__name__, et - st)
        logger.info(output_str)
        return rst

    return _wrap
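# Usage sketch (assumption): "timeit" is the name assumed above for the outer
# decorator; "slow_add" is an illustrative function, not part of the original code.
# @timeit
# def slow_add(a, b):
#     time.sleep(0.3)
#     return a + b
#
# slow_add(1, 2)  # logs e.g. "func: 'slow_add' time: 0.30xxs"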
def executeByInsOrUpdOrDel(self, connection, cursor, sql: str, values=None):
    """
    Execute a single insert/update/delete statement.
    :param connection: database connection object
    :param cursor: mysql cursor
    :param sql: sql statement to execute
    :param values: data to insert, a tuple for one row or a list for many rows
    :return:
    """
    try:
        res = []
        if connection and cursor and sql:
            if values:
                if isinstance(values, tuple):
                    cursor.execute(sql, values)
                elif isinstance(values, list):
                    cursor.executemany(sql, values)
                connection.commit()
                res = cursor.fetchall()
                logger.info("operation succeeded")
                logger.info("execution result: {}".format(res))
                return res
            logger.info("error: no values supplied")
            return res
        logger.info("missing required parameters")
        return res
    except Exception as e:
        logger.error("operation failed: {}".format(e))
        connection.rollback()
        return None
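# Usage sketch (assumption): relies on the PyMysql helper and connectAndGetCursor
# referenced elsewhere in this project; the table and column names are illustrative.
# mysql_db = PyMysql(HOST, PORT, USER, PASSWORD, DATABASE)
# connection, cursor = mysql_db.connectAndGetCursor()
# sql = "insert into movie (name, magnet) values (%s, %s)"
# rows = [("《movie_a》", "magnet:?xt=..."), ("《movie_b》", "magnet:?xt=...")]
# mysql_db.executeByInsOrUpdOrDel(connection, cursor, sql, values=rows)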
def file_processing(uncompress_dir):
    """
    Move all files under the directory tree into the top-level directory.

    Args:
        uncompress_dir: directory produced by decompression
    Returns:
        the same directory path
    """
    logger.info(f"start organize files : {uncompress_dir}")
    if os.path.exists(uncompress_dir):
        # Move every file up to the top-level directory, skipping name collisions
        for path, dirs, files in os.walk(uncompress_dir):
            for file in files:
                if not os.path.exists(os.path.join(uncompress_dir, file)):
                    shutil.move(os.path.join(path, file), uncompress_dir)
        logger.info("files gathered and moved")
        # Remove the now-empty sub-directories
        for path, dirs, files in os.walk(uncompress_dir):
            for son_dir in dirs:
                shutil.rmtree(os.path.join(path, son_dir))
        logger.info("empty directories removed")
        # Delete hidden files (names starting with ".")
        for path, dirs, files in os.walk(uncompress_dir):
            for file in files:
                if file.startswith("."):
                    os.remove(os.path.join(path, file))
        logger.info("irrelevant files removed")
        return uncompress_dir
def _word_to_pdf_batch(dir_path, transform=None, delete=True):
    """Convert every doc/docx file in the directory to PDF via LibreOffice, then optionally delete the sources."""
    if transform is None:
        transform = ["doc", "docx"]
    try:
        if dir_path and os.path.exists(dir_path) and transform:
            for file_type in transform:
                status = os.system(
                    f"soffice --headless --convert-to pdf {dir_path}/*.{file_type} --outdir {dir_path}"
                )
                if status == 0 and delete:
                    try:
                        os.system(rf"rm -f {dir_path}/*.{file_type}")
                        logger.info(f"rm -f {dir_path}/*.{file_type}")
                    except Exception as e:
                        logger.exception(e)
    except Exception as e:
        logger.exception(e)
def get_message(self, key, func):
    """Poll the redis list `key` and hand each message to `func` in a new thread; exit after `timeout` idle seconds."""
    try:
        index = 0
        while True:
            message = cache.rpop(key)
            if message:
                logger.info(f"{key}, message{message}")
                # message = deserialization(self.__cls__().Default, message, url="default")
                # Note: eval() trusts whatever is stored in redis; only use it on data this process wrote itself
                merge_process = Thread(target=func, args=(eval(message), ))
                merge_process.start()
                index = 0
            else:
                if index >= self.timeout:
                    exit()
                index += 1
                # logger.info(f"{key}: {index}")
                time.sleep(1)
    except Exception as e:
        logger.exception(e)
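# Usage sketch (assumption): each redis list gets its own consumer thread, mirroring
# the wiring in start(); message_a and download_page belong to this class.
# Thread(target=self.get_message, args=(self.message_a, self.download_page)).start()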
def get(self): """ # @description: # @param {type} # @return: """ try: key = request.args.get("key", None) if key == "search": values = request.args.get("search", None) if values: search_res = ZeroCrawler(values).start() print(search_res) res = {"code": 200, "msg": "OK", "data": search_res} logger.info("请求成功:{}".format(res)) return res res = {"code": 1002, "msg": "error", "data": []} logger.info("请求失败:{}".format(res)) return res res = {"code": 1000, "msg": "不做处理", "data": []} logger.info("请求成功:{}".format(res)) return res except Exception as e: logger.error(e)
def word_to_pdf_batch(self, dir_path):
    """
    Convert every word file in a directory to PDF. Intended for documents whose
    content is mainly embedded images; the text itself is not relied on.

    Args:
        dir_path: directory to process
    Returns:
    """
    if dir_path:
        try:
            all_word_files = []
            for root, dirs, files in os.walk(dir_path):
                for x in files:
                    if x.rsplit(".", 1)[-1] in ["doc", "docx"] and not x.startswith("."):
                        word_path = os.path.join(root, x)
                        all_word_files.append(word_path)
                        self.word_to_pdf_single(word_path, dir_path)
            logger.info(all_word_files)
        except Exception as e:
            logger.exception(e)
def processes(crawlerProcess, start_url):
    logger.info("start to get info with dytt")
    number, result = crawlerProcess(start_url).start()
    logger.info("start write info to mysql")
    # self.write_to_mysql(result, dyHeaven_fields, DyHeaven)
    print("database fields: {}".format(dyHeaven_fields))
    logger.info("extracted result: {}".format(result))
    return number, result
def executeBySelect(self, connection, cursor, sql: str):
    """Execute a select statement and return all fetched rows."""
    try:
        res = []
        if connection and cursor and sql:
            cursor.execute(sql)
            res = cursor.fetchall()
            logger.info("operation succeeded")
            logger.info("execution result: {}".format(res))
            return res
        logger.info("missing required parameters")
        return res
    except Exception as e:
        logger.error(e)
        return []
def start(self):
    try:
        self.push_item_in_redis_list(self.message_a, self.start_item)
        thr_downloader = Thread(target=self.get_message,
                                args=(self.message_a, self.download_page))
        thr_primary = Thread(target=self.get_message,
                             args=(self.message_b, self.primary))
        thr_merge_result = Thread(target=self.get_message,
                                  args=(self.message_c, self.merger_result))
        thr_downloader.start()
        logger.info("downloader thread started!")
        time.sleep(3)
        thr_primary.start()
        logger.info("extraction thread started!")
        thr_merge_result.start()
        logger.info("merge thread started")
        thr_merge_result.join()
        return self.result
    except Exception as identifier:
        logger.exception(identifier)
def check_libre_office_status():
    """
    Check whether libreoffice is installed; install it if not.

    Returns:
    """
    try:
        # os.system returns the command's exit status, 0 when libreoffice is present
        office_info = os.system("libreoffice --version")
        logger.info(f"{office_info}")
        if office_info != 0:
            logger.info("libreoffice is not installed")
            logger.info("start install libreoffice")
            if os.system("yum install -y libreoffice") == 0:
                logger.info("install succeeded")
            else:
                logger.info("install failed")
        else:
            logger.info("libreoffice is already installed")
    except Exception as e:
        logger.exception(e)
def primary(self, item):
    """
    Main extraction method: pulls links and the related fields from the page content.
    """
    item = MyDict(item)
    try:
        logger.info("start extracting: {}".format(item.url))
        xpath_obj = lxml.html.fromstring(item.content)
        # current_app.logger.info("start extracting: {}".format(item.type))
        if item.type == "1":
            # List page: each table holds one movie entry
            try:
                tables = xpath_obj.xpath(
                    "//div[@class='bd3']//div[@class='co_content8']/ul//table"
                )
                print(len(tables))
                for table in tables:
                    # Extract the detail-page URL
                    url = table.xpath(".//tr[2]//a/@href")
                    if url:
                        new_url = url[0]
                        if not new_url.startswith("http"):
                            new_url = dyttUrl + new_url
                        tr_item = MyDict()
                        tr_item.url = new_url
                        tr_item.type = "2"
                        title = table.xpath(".//tr[2]//a/text()")
                        tr_item.title = title[0] if title else "None"
                        name = table.xpath(".//tr[2]//a/text()")[0] if table.xpath(
                            ".//tr[2]//a/text()") else "《None》"
                        tr_item.name = "《{}》".format("".join(re.findall(r"《(.*?)》", name)))
                        date = table.xpath(".//tr[3]//font/text()")
                        tr_item.date = date[0] if date else "None"
                        introduction = table.xpath(".//tr[4]/td/text()")
                        tr_item.introduction = introduction[0] if introduction else "None"
                        self.push_item_in_redis_list(self.message_a, tr_item)
                        logger.info("{}".format({
                            "url": tr_item.url[:30],
                            "name": tr_item.name[:6],
                            "title": tr_item.title[:6],
                            "introduction": tr_item.introduction[:30],
                            "type": tr_item.type[:6]
                        }))
            except Exception as e:
                logger.error("list page extraction: {}".format(e))
                # current_app.logger.error("list page extraction: {}".format(e))
        elif item.type == "2":
            # Detail page: collect the download (magnet) links
            try:
                logger.info("start extracting detail page: {}".format(item.url))
                # current_app.logger.info("start extracting detail page: {}".format(item.url))
                down_lists = xpath_obj.xpath("//div[@id='downlist']/table")
                if not down_lists:
                    down_lists = xpath_obj.xpath("//div[@id='Zoom']/table")
                # print("links after reprocessing: {}".format(len(down_lists)))
                # print("download links found: {}".format(len(down_lists)))
                # magnet_info = {}
                magnet_info = []
                for downloader in down_lists:
                    magnet = downloader.xpath(".//a/@href")[0] if downloader.xpath(
                        ".//a/@href") else ""
                    download_name = downloader.xpath(".//a/text()")[0] if downloader.xpath(
                        ".//a/text()") else "]"
                    magnet_name = re.split(r"[=\]/]", download_name)[-1]
                    # magnet_info.update({magnet_name: magnet})
                    magnet_info.append((magnet_name, magnet))
                item.magnet = magnet_info
                self.push_item_in_redis_list(self.message_c, item)
            except Exception as e:
                logger.error("detail page extraction: {}".format(e))
                # current_app.logger.error("detail page extraction: {}".format(e))
        elif item.type == "0":
            # Index page: discover the per-category list pages
            try:
                area2s = xpath_obj.xpath(
                    "//div[@class='bd2']/div[@class='index_list']/div[@class='co_area2']"
                )
                for area in area2s:
                    url = area.xpath(".//div[@class='title_all']/p/span/a/@href")
                    if url:
                        new_url = url[0]
                        if not new_url.startswith("http"):
                            new_url = dyttUrl + new_url
                        logger.info("url extracted from index page: {}".format(new_url))
                        # current_app.logger.info("url extracted from index page: {}".format(new_url))
                        # tr_item = DyHeavenCrawler.Default(new_url)
                        tr_item = MyDict()
                        tr_item.url = new_url
                        tr_item.type = "1"
                        tr_item.classify = area.xpath(
                            ".//div[@class='title_all']/p/span/a/text()")[0]
                        # print("category extracted from index page: {}".format(tr_item.classify))
                        self.push_item_in_redis_list(self.message_a, tr_item)
            except Exception as e:
                logger.error("index page extraction: {}".format(e))
                # current_app.logger.error("index page extraction: {}".format(e))
    except Exception as e:
        logger.error("primary: {}".format(e))