# Standard-library / third-party imports used by the classes below.
# NOTE: the project-local helpers (DBConfig, curlData, getUserAgent, getCookie,
# debug, virtualIp, settings, lock, CommonFunc, MvListThread, RecipeListThread,
# ConstitutionThread, phoenix_db) are assumed to come from the project's own
# modules; their import paths are not shown in the original source, so they are
# not guessed here. Python 3 is assumed (on Python 2, urlencode lives in urllib).
import json
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from time import sleep
from urllib.parse import urlencode

from bs4 import BeautifulSoup
from pytube import YouTube
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.firefox.options import Options


class MvList(object):
    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    @classmethod
    def run(cls):
        """
        start fetching the movie list in a worker thread
        :return:
        """
        mv_list_thread = MvListThread()
        mv_list_thread.run()

    def get_mv_list(self):
        """
        Return every row of the `list` table.
        :return:
        """
        select_arr = {
            "table": "list"
        }
        mv_list = self.db.select(select_arr, is_close_db=False)
        return mv_list
class MvImg(object):
    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def run(self):
        self.handle_data()

    def handle_data(self):
        """Download the poster image for every pending row using a thread pool."""
        data = self.get_img_list()
        thread_pool = ThreadPoolExecutor(max_workers=15)
        task_list = list()
        for item in data:
            task = thread_pool.submit(self.__handle_data, item)
            task_list.append(task)
            # break
        for i in as_completed(task_list):
            result = i.result()

    def __handle_data(self, item):
        url = "https://www.pelisplay.tv" + item['img_src']
        header = {
            # "Referer": "https://www.pelisplay.tv/",
            "User-Agent": getUserAgent(),
            "Accept": "image/webp,image/apng,image/*,*/*;q=0.8"
        }
        data = curlData(url, header=header)
        with open("static/images/{id}.jpg".format(id=item['id']), "wb") as f:
            try:
                # curlData may return str; encode it before writing binary data
                data = data.encode("utf-8")
            except Exception as e:
                debug(e)
            f.write(data)
            self.__update_data(item)
        return {"code": 0}

    def __update_data(self, item):
        update_arr = {
            "table": "list",
            "set": {
                "img_status": 1
            },
            "condition": ['id={id}'.format(id=item['id'])]
        }
        lock.acquire()
        result = self.db.update(update_arr, is_close_db=False)
        lock.release()
        return result

    def get_img_list(self):
        select_arr = {
            "table": "list",
            "columns": ["id", "img_src"],
            "condition": ['img_status=0']
        }
        data = self.db.select(select_arr, is_close_db=False)
        return data
class MvListData(object):
    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def run(self):
        self.handle()

    def handle(self):
        """Copy the `list` row of every entry referenced by `tmp_content` into `tmp_list`."""
        data = self.get_data()
        for item in data:
            self.__mv_data(item)

    def __mv_data(self, item):
        select_arr = {
            "table": "list",
            "condition": ["id={id}".format(id=item['list_id'])]
        }
        try:
            data = self.db.select(select_arr, is_close_db=False)[0]
        except Exception as e:
            debug(e)
            return
        sql = self.db.getInsertSql(data, "tmp_list")
        debug(sql)
        result = self.db.insert(sql, is_close_db=False)
        return result

    def get_data(self):
        select_arr = {
            "table": "tmp_content",
        }
        data = self.db.select(select_arr, is_close_db=False)
        return data
class ClearNullData(object):
    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def run(self):
        self.handle()

    def handle(self):
        data = self.get_data()
        debug(data)
        self.del_list(data)
        self.del_content(data)

    def del_list(self, data):
        for item in data:
            self.__del_list(item)

    def __del_list(self, item):
        delete_arr = {
            "table": "list",
            "condition": ["id={id}".format(id=item['parent_id'])]
        }
        result = self.db.delete(delete_arr, is_close_db=False)
        return result

    def del_content(self, data):
        for item in data:
            self.__del_content(item)

    def __del_content(self, item):
        delete_arr = {
            "table": "content",
            "condition": ["parent_id={id}".format(id=item['parent_id'])]
        }
        result = self.db.delete(delete_arr, is_close_db=False)
        return result

    def get_data(self):
        select_arr = {
            "table": "content",
            "limit": [0, 10],
            "condition": ["video_src=''", "and", "url=''"]
        }
        data = self.db.select(select_arr, is_close_db=False)
        return data
class GetImgUrlLarge(object):
    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def run(self):
        self.handle()

    def handle(self):
        data = self.get_data()
        for item in data:
            result = self.__handle(item)
            # __handle returns None when no thumbnail path could be extracted;
            # skip such rows instead of indexing into None
            if not result:
                continue
            if result['result_1'] == 1 and result['result_2'] == 1 and result['result_3'] == 1:
                debug(item['img_url'])
            else:
                break

    def __handle(self, item):
        img_url = item['img_url']
        try:
            s = re.findall('squarethumbnails\/([\w\W]*.)', img_url)[0]
        except Exception as e:
            s = ''
            debug(e)
        if s == '':
            return
        s = 'http://www.laurainthekitchen.com/largethumbnails/' + s
        result = self.__update_data(s, item)
        return result

    def __update_data(self, s, item):
        update_arr_list = {
            "table": "list",
            "set": {
                "img_url_large": s,
                "status": 1
            },
            "condition": ["id={id}".format(id=item['id'])]
        }
        result_1 = self.db.update(update_arr_list, is_close_db=False)

        del update_arr_list['set']['status']
        update_arr_list['table'] = "tmp_list"
        result_2 = self.db.update(update_arr_list, is_close_db=False)

        update_arr_list['table'] = "content"
        update_arr_list['condition'] = ["list_id={id}".format(id=item['id'])]
        result_3 = self.db.update(update_arr_list, is_close_db=False)

        return {
            "result_1": result_1,
            "result_2": result_2,
            "result_3": result_3
        }

    def get_data(self):
        select_arr = {
            "table": "list",
            "columns": ["img_url", "id"],
            "condition": ["status=0"]
        }
        data = self.db.select(select_arr, is_close_db=False)
        return data
class GetImages(object):
    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def run(self):
        self.handle()

    def handle(self):
        data = self.get_data()
        self.get_images(data, "large", img_url="img_url_large")

    @classmethod
    def start_thread(cls, data, fun, path, img_url, prefix):
        thread_pool = ThreadPoolExecutor(max_workers=15)
        task_list = list()
        result = list()
        for item in data:
            task = thread_pool.submit(fun, item, path, img_url, prefix)
            task_list.append(task)
        for i in as_completed(task_list):
            result.append(i.result())
        return result

    def get_images(self, data, path, img_url, prefix=""):
        self.start_thread(data, self.__get_images, path, img_url, prefix)

    def __get_images(self, item, path, img_url, prefix):
        page_resource = self.get_page_resource(prefix + item[img_url])
        with open("static/images/{path}/{id}.jpg".format(path=path, id=item['id']), "wb") as f:
            try:
                page_resource = page_resource.encode("utf-8")
            except Exception as e:
                debug(e)
            f.write(page_resource)
        update_data = {"status": 1}
        condition = ["id={id}".format(id=item['id'])]
        self.__update_data(update_data, "list", condition)

    @classmethod
    def get_page_resource(cls, url):
        data = curlData(url, open_virtual_ip=True)
        return data

    def __update_data(self, update_data, table, condition):
        update_arr = {
            "table": table,
            "set": update_data,
            "condition": condition
        }
        lock.acquire()
        self.db.update(update_arr, is_close_db=False)
        lock.release()

    def get_data(self):
        data = self.db.select(
            {
                "table": "list",
                "columns": ["id", "img_url", "img_url_large"],
                "condition": ["status=0"]
            },
            is_close_db=False)
        return data
class GetVideoSrc(object):
    def __init__(self):
        self.cookie = {}
        # self.get_cookie()
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def run(self):
        data = self.get_content_list()
        self.handle_data(data)

    def handle_data(self, data):
        thread_pool = ThreadPoolExecutor(max_workers=15)
        task_list = list()
        for item in data:
            if item['url'] == '':
                continue
            else:
                task = thread_pool.submit(self.__handle_data, item)
                task_list.append(task)
                # self.__handle_data(item)
        for i in as_completed(task_list):
            result = i.result()

    def __handle_data(self, item):
        update_data = dict()
        update_data['status'] = 1
        update_data['video_src'] = self.__get_video_src(item)
        debug(update_data['video_src'])
        self.__update_data(item['id'], update_data)
        return {"code": 0}

    def __get_video_src(self, item):
        header = {
            # "Referer": "http://www.wyysdsa.com/",
            "User-Agent": getUserAgent(),
            # "Cache-Control": "max-age=0",
            # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
        }
        # url = "http://zeus.pelisplay.tv/embed/vip.php?u=Q1A5NUZJM1VDTWlUTk8wTEFmWGNQZDhnbWRIcmt6UVU0VGIxakpXOUF4Mi9yZW51Zi9yaXZlcXFoYnlwL3picC5hYm1uem4uampqLy86ZmNnZ3U&fondo_requerido="
        # url = "https://nl.tan90.club/test/testHeader.html"
        data = curlData(url=item['url'], header=header, cookie=self.cookie)
        # with open("tmp/content_detail.txt", "rb") as f:
        #     data = f.read().decode("utf-8")
        #     f.close()
        try:
            # the player embeds its sources as a JSON.parse('...') call; pull the
            # JSON out, strip the escaping and take the first file entry
            src = re.findall("JSON\.parse\('([\w\W]*?)'\)\);", data)[0]
            src = src.replace("\\", "")
            src = json.loads(src)
            src = src[0]['file']
        except Exception as e:
            src = ""
            debug(e)
        return src

    def __update_data(self, content_id, update_data):
        update_arr = {
            "table": "content",
            "set": update_data,
            "condition": ['id={content_id}'.format(content_id=content_id)]
        }
        lock.acquire()
        result = self.db.update(update_arr, is_close_db=False)
        lock.release()
        return result

    def get_content_list(self):
        data = self.db.select({
            "table": "content",
            "columns": ['id', 'url'],
            "condition": ['status=0']
        }, is_close_db=False)
        return data

    def get_cookie(self):
        header = {
            "User-Agent": getUserAgent(),
            # "Cache-Control": "max-age=0",
            # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
        }
        url = "https://www.pelisplay.tv/"
        self.cookie = getCookie(url, header=header)
        debug(self.cookie)
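# --- Usage sketch (not part of the original source) ---------------------------
# A minimal sketch of how the pelisplay-related classes above might be driven.
# The helper name and the ordering (list first, then video sources, then poster
# images) are assumptions; only the run() methods themselves exist in the
# classes above.
def run_pelisplay_pipeline():
    MvList.run()          # fill the `list` table via MvListThread
    GetVideoSrc().run()   # resolve video_src for every pending `content` row
    MvImg().run()         # download poster images for rows with img_status=0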
class RecipeType(object):
    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def get(self):
        """
        Return all second-level categories, scraping them first if the table is empty.
        :return:
        """
        select_arr = {"table": "type", "condition": ['nav_type=2']}
        data = self.db.select(select_arr, is_close_db=False)
        # check whether we already have data; if not, fetch all categories first
        if not data:
            self.get_recipe_type()
            data = self.db.select(select_arr, is_close_db=False)
        return data

    def get_category(self):
        """
        Return every category that has not been crawled yet (status=0).
        :return:
        """
        select_arr = {"table": "type", "condition": ['status=0']}
        return self.db.select(select_arr, is_close_db=False)

    def get_recipe_type(self):
        """
        Scrape the site's category menu and store it in the `type` table.
        :return:
        """
        category_li = self.__handle_category()
        self.__handle_category_data(category_li)

    @classmethod
    def __handle_category(cls):
        """
        Fetch the index page and return the top-level <li> tags of the category menu.
        :return:
        """
        url = CommonFunc().generate_url()
        page_resource = curlData(url, open_virtual_ip=True)
        bs_data = BeautifulSoup(page_resource, "html.parser")
        category_ul = bs_data.find_all("ul", attrs={"class": "sub-menu"})
        # only get the direct child <li> tags, not all descendants (hence recursive=False)
        return category_ul[0].find_all("li", recursive=False)

    def __handle_category_data(self, category_li, handle_type=1, parent_id=0):
        """
        Insert each category <li>; recurse into sub-menus for parent entries.
        :param category_li: list of <li> tags to process
        :param handle_type: 1 for top-level entries, 2 for children
        :param parent_id: id of the parent category when handle_type == 2
        :return:
        """
        table_columns = (("id", "int"), ("name", "varchar"),
                         ("page_num", "longtext"), ("nav_type", "int"),
                         ("keyword", "varchar"), ("parent_id", "int"))
        for item in category_li:
            insert_arr = {"parent_id": 0, "nav_type": 2}
            try:
                href = item.find("a").attrs['href']
                try:
                    insert_arr['keyword'] = re.findall('category=([\w\W]*.)', href)[0]
                except Exception as e:
                    debug(e)
                if handle_type == 2:
                    insert_arr['parent_id'] = parent_id
                if href == "#":
                    # a "#" link marks a parent entry: store it, then recurse into its children
                    insert_arr['name'] = item.find("span").getText().strip()
                    insert_arr['nav_type'] = 1
                    sql = self.db.getInsertSql(insert_arr, "type", table_columns=table_columns)
                    lastest_id = self.db.insertLastId(sql, is_close_db=False)
                    if lastest_id == 0:
                        debug("get data error")
                        continue
                    self.__handle_category_data(item.find_all("li"), 2, lastest_id)
                else:
                    insert_arr['name'] = item.getText().strip()
                    sql = self.db.getInsertSql(insert_arr, "type", table_columns=table_columns)
                    self.db.insert(sql, is_close_db=False)
            except Exception as e:
                debug(e)
class MvCategory(object):
    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def get_category(self):
        """
        Return every stored category, or an empty list if none exist.
        :return:
        """
        select_arr = {"table": "type"}
        category = self.db.select(select_arr, is_close_db=False)
        if not category:
            return []
        return category

    def get(self):
        """
        Parse the cached index page and store every category found on it.
        :return:
        """
        # page_resource = self.get_data()
        with open("tmp/index_page.txt", "rb") as f:
            page_resource = f.read().decode("utf-8")
        bs = BeautifulSoup(page_resource, "html.parser")
        category_list = self.__get_category_list(bs)
        for item in category_list:
            self.handle_data(item)

    def handle_data(self, item):
        """
        Extract one category's fields and save them.
        :param item:
        :return:
        """
        insert_arr = dict()
        insert_arr['status'] = 0
        insert_arr['url'] = self.__get_category_url(item)
        insert_arr['img_src'] = self.__get_category_img_src(item)
        insert_arr['icon_img_src'] = self.__get_category_icon_img_src(item)
        insert_arr['name'] = self.__get_category_name(item)
        insert_arr['description'] = self.__get_category_description(item)
        if self.__save_data(insert_arr):
            debug("category saved successfully")
        else:
            debug("failed to save category")

    def __save_data(self, insert_arr):
        """
        Insert one category row; return True on success.
        :param insert_arr:
        :return:
        """
        table_columns = (("id", "int"), ("img_src", "varchar"),
                         ("icon_img_src", "varchar"), ("url", "varchar"),
                         ("name", "varchar"), ("description", "text"))
        sql = self.db.getInsertSql(insert_arr, table="type", table_columns=table_columns)
        result = self.db.insert(sql, is_close_db=False)
        if result == 0:
            return False
        return True

    @classmethod
    def get_data(cls):
        """
        Fetch the live index page.
        :return:
        """
        url = settings.DOMAIN
        data = curlData(url, open_virtual_ip=True)
        return data

    @classmethod
    def __get_category_list(cls, bs):
        """
        Return the <li class="item"> tags of the category carousel.
        :param bs:
        :return:
        """
        category_list = bs.find_all("ul", attrs={"class": "owl-carousel"})
        try:
            category_list = category_list[0].find_all("li", attrs={"class": "item"})
        except Exception as e:
            category_list = list()
            debug("failed to get the category list, error: {error}".format(error=e))
        return category_list

    @classmethod
    def __get_category_url(cls, item):
        """
        :param item:
        :return:
        """
        category_url = item.find("a")
        try:
            category_url = category_url.attrs['href']
        except Exception as e:
            category_url = ""
            debug("failed to get the category url, error: {error}".format(error=e))
        return category_url

    @classmethod
    def __get_category_img_src(cls, item):
        """
        :param item:
        :return:
        """
        category_img_src = item.find("img")
        try:
            category_img_src = category_img_src.attrs['src']
        except Exception as e:
            category_img_src = ""
            debug("failed to get the image src, error: {error}".format(error=e))
        return category_img_src

    @classmethod
    def __get_category_icon_img_src(cls, item):
        """
        get icon img src
        :param item:
        :return:
        """
        category_icon_img_src = item.find("img")
        try:
            category_icon_img_src = category_icon_img_src.attrs['src']
        except Exception as e:
            category_icon_img_src = ""
            debug("failed to get the icon image src, error: {error}".format(error=e))
        return category_icon_img_src

    @classmethod
    def __get_category_name(cls, item):
        """
        get category name
        :param item:
        :return:
        """
        category_name = item.find("div", attrs={"class": "category-name"})
        try:
            category_name = category_name.get_text().strip()
        except Exception as e:
            category_name = ""
            debug("failed to get the category name, error: {error}".format(error=e))
        return category_name

    @classmethod
    def __get_category_description(cls, item):
        """
        :param item:
        :return:
        """
        category_description = item.find("div", attrs={"class": "category-description"})
        try:
            category_description = category_description.get_text().strip()
        except Exception as e:
            category_description = ""
            debug("failed to get the category description, error: {error}".format(error=e))
        return category_description
class GetRecipeVideo(object):
    def __init__(self):
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()

    def run(self):
        self.download()

    def download(self):
        data = self.get_tmp_content()
        self.__download(data)

    def __download(self, data):
        for item in data:
            url = "https://www.youtube.com/watch?v=%s" % item['video_id']
            debug("start fetching: --> {video_id}".format(video_id=item['video_id']))
            try:
                youtube = YouTube(url)
                youtube.streams.filter(subtype="mp4").first().download(
                    "/Users/cpx/code/py/recipe/data/recipe/",
                    filename=item['video_id'])
                self.__update_data(item['id'])
            except Exception as e:
                debug(e)

    def __update_data(self, list_id):
        """
        Mark one tmp_content row as downloaded.
        :param list_id:
        :return:
        """
        update_arr = {
            "table": "tmp_content",
            "set": {
                "status": 1
            },
            "condition": ['id={list_id}'.format(list_id=str(list_id))]
        }
        result = self.db.update(update_arr, is_close_db=False)
        return result

    def get_tmp_content(self):
        data = self.db.select(
            {
                "table": "tmp_content",
                "columns": ['id', 'video_id'],
                "condition": ['status=0']
            },
            is_close_db=False)
        return data

    def handle_data(self):
        self.move_data()
        # data = self.get_data()

    def get_data(self):
        select_arr = {"table": "recipe_content"}
        data = self.db.select(select_arr, is_close_db=False)
        return data

    def move_data(self):
        """Copy up to 20 list entries per category (and their first content row) into tmp_content."""
        category = self.get_category()
        for item in category:
            data = self.get_list_by_type_id(item['id'])
            self.__move_data(data)

    def __move_data(self, data):
        for item in data:
            content = self.get_content_by_list_id(item['id'])
            try:
                content = content[0]
                content['status'] = 0
                self.__insert_data(content)
            except Exception as e:
                debug(e)

    def __insert_data(self, insert_arr):
        sql = self.db.getInsertSql(insert_arr, "tmp_content")
        result = self.db.insert(sql, is_close_db=False)
        return result

    def get_list_by_type_id(self, type_id):
        data = self.db.select(
            {
                "table": "list",
                "condition": ['recipe_type_id={type_id}'.format(type_id=type_id)],
                "limit": [0, 20]
            },
            is_close_db=False)
        return data

    def get_content_by_list_id(self, list_id):
        data = self.db.select(
            {
                "table": "content",
                "columns": ['video_id', 'list_id'],
                "condition": ["list_id={list_id}".format(list_id=str(list_id))]
            },
            is_close_db=False)
        return data

    def get_category(self):
        data = self.db.select({
            "table": "type",
            "condition": ['keyword<>""']
        }, is_close_db=False)
        return data
class RecipeListSpider(object):
    def __init__(self):
        self.db = DBConfig()
        self.recipe_type = RecipeType()

    def __del__(self):
        self.db.closeDB()

    def run(self):
        """
        start fetching the recipe list
        :return:
        """
        self.get_recipe_list()

    def get_list(self, condition=None, limit=None):
        """
        Return every list row that has not been processed yet (status=0).
        :return:
        """
        select_arr = {"table": "list", "condition": ['status=0']}
        data = self.db.select(select_arr, is_close_db=False)
        return data

    def get_category(self):
        """
        get every category together with its page numbers
        :return:
        """
        return self.recipe_type.get_category()

    def get_recipe_list(self):
        """
        :return:
        """
        self.__get_recipe_list()

    def __get_recipe_list(self):
        """
        :return:
        """
        info = self.get_category()
        for item in info:
            self.__get_recipe_list_child(item)

    def __set_status(self, category_id):
        """
        Mark a category as crawled.
        :param category_id:
        :return:
        """
        update_arr = {
            "table": "type",
            "set": {
                "status": 1
            },
            "condition": ['id={category_id}'.format(category_id=category_id)]
        }
        result = self.db.update(update_arr, is_close_db=False)
        if result == 0:
            debug("failed to update the status, reason: unknown")
        return

    def __get_recipe_list_child(self, info):
        """
        Crawl every page of one category in worker threads, then mark it done.
        :param info:
        :return:
        """
        try:
            page_list = json.loads(info['page_num'])['page_list']
        except Exception as e:
            debug(e)
            self.__set_status(info['id'])
            return
        category = info['keyword']
        if category == "":
            self.__set_status(info['id'])
            return
        page_list = page_list.split(",")
        recipe_list_thread = RecipeListThread(page_list, info)
        recipe_list_thread.run()
        self.__set_status(info['id'])
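# --- Usage sketch (not part of the original source) ---------------------------
# A minimal sketch of how the recipe-related classes above might be chained.
# The helper name and the ordering are assumptions: scrape the category menu
# and recipe lists first, then handle thumbnails, and finally pull the queued
# YouTube videos. Only the run()/get() methods exist in the original classes.
def run_recipe_pipeline():
    RecipeType().get()          # make sure the `type` table is populated
    RecipeListSpider().run()    # crawl every category's recipe list pages
    GetImgUrlLarge().run()      # derive large-thumbnail URLs for new rows
    GetImages().run()           # download the large thumbnails
    GetRecipeVideo().run()      # download the queued YouTube videos via pytube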
class GetConstitutionList(object):
    def __init__(self):
        # database connections
        # self.ws_db = phoenix_db.DBConfig()
        self.count = 0
        self.db = DBConfig()

    def __del__(self):
        self.db.closeDB()
        debug("fetched %s records in this run" % str(self.count))
        # self.ws_db.closeDB()

    def getAllConstitutionStart(self):
        # resume from the last recorded page, or start at page 1
        try:
            record = self.db.select({"table": "constitutions_record",
                                     "condition": ['is_over=0']},
                                    is_close_db=False)
            next_page = record[0]['page']
        except:
            next_page = 1
        while True:
            try:
                data = self.getConstitutionList(next_page)
            except:
                debug("failed to fetch the page content, retrying")
                continue
            # extract the number of the next page
            try:
                tmpNextPage = re.findall('href="javascript:toUpDownPage\(\'(\d+)\'\);">下一页<\/a>', data)[0]
                debug("current page: %s" % str(next_page))
                debug("next page found: %s" % str(tmpNextPage))
            except:
                debug("failed to extract the next page number")
                break
            self.getAllConstitutionHandle(data, "北京")
            update_arr = {
                "table": "constitutions_record",
                "condition": ['id=2'],
                "set": {
                    "page": tmpNextPage,
                    "is_over": 0
                }
            }
            self.db.update(update_arr, is_close_db=False)
            if int(next_page) >= int(tmpNextPage):
                break
            else:
                next_page = tmpNextPage
        debug("crawl finished")
        update_arr = {
            "table": "constitutions_record",
            "condition": ['id=2'],
            "set": {
                "is_over": 1
            }
        }
        self.db.update(update_arr, is_close_db=False)

    def getAllConstitution(self, fun):
        """
        Fetch every law/regulation by driving a headless Firefox through the site
        and passing each result page to `fun`.
        :param fun: callback that receives (page_source, province)
        :return:
        """
        url = "http://210.82.32.100:8081/FLFG/"
        dcap = dict(DesiredCapabilities.FIREFOX)
        ip = virtualIp()
        dcap['phantomjs.page.customHeaders.X-FORWARDED-FOR'] = ip
        dcap['phantomjs.page.customHeaders.CLIENT-IP'] = ip
        firefox_options = Options()
        firefox_options.add_argument("--headless")
        firefox_options.add_argument('--disable-gpu')
        driver = webdriver.Firefox(firefox_options=firefox_options, desired_capabilities=dcap)
        driver.get(url)
        sleep(3)
        cloumn = driver.find_elements_by_class_name("cloumn")
        try:
            cloumn = cloumn[3]
        except:
            # the column list may not be rendered yet; re-query until it is
            while True:
                try:
                    cloumn = driver.find_elements_by_class_name("cloumn")[3]
                    break
                except:
                    sleep(1)
        cloumntitle = cloumn.find_elements_by_class_name("threecloumntitle")
        cloumntitleLength = len(cloumntitle)
        current_handle = driver.current_window_handle
        for i in range(cloumntitleLength):
            try:
                list_a = cloumntitle[i].find_elements_by_tag_name("a")
            except:
                list_a = list()
                debug("failed to get the province list")
            list_a_len = len(list_a)
            for k in range(list_a_len):
                # get the province name
                try:
                    province = list_a[k].text
                except:
                    debug("failed to get the province name, continuing; province anchor index is " + str(k))
                    province = str(k)
                debug(province + ":")
                try:
                    list_a[k].click()
                except:
                    debug("click failed")
                sleep(3)
                all_handles = driver.window_handles
                sleep(3)
                for handle in all_handles:
                    if handle != current_handle:
                        driver.switch_to_window(handle)
                        sleep(1)
                        data = driver.page_source
                        htmlData = BeautifulSoup(data, "html.parser")
                        try:
                            url = htmlData.find_all("iframe", attrs={"id": "rightpage"})[0].attrs['src']
                            url = re.sub("(有效)", "有效,已被修正,失效", url)
                            driver.execute_script("location.href='" + url + "'")
                            sleep(3)
                            # switch to 50 results per page
                            try:
                                driver.find_element_by_id("span_pagesize_50").click()
                                sleep(3)
                            except:
                                pass
                            data = driver.page_source
                            # read the next-page number so we can tell later when paging stops
                            try:
                                nextPage = re.findall(r'<a[\w\W]*?href="([\w\W]*?)">下一页', data)
                                nextPage = re.findall(r'<a[\w\W]*?href="([\w\W]*?)\)', nextPage[0])
                                nextPage = re.findall("(\d+)", nextPage[1])
                                nextPage = nextPage[0]
                            except:
                                nextPage = 0
                            # process the data page by page
                            while True:
                                tmpPage = int(nextPage) - 1
                                debug("page " + str(tmpPage) + ":")
                                fun(data, province)
                                nextPageElement = driver.find_element_by_class_name("td")
                                try:
                                    # click the next-page link
                                    nextPageElement = nextPageElement.find_elements_by_tag_name("a")[1]
                                    nextPageElement.click()
                                    sleep(3)
                                    data = driver.page_source
                                    try:
                                        tmpNextPage = re.findall(r'<a[\w\W]*?href="([\w\W]*?)">下一页', data)
                                        tmpNextPage = re.findall(r'<a[\w\W]*?href="([\w\W]*?)\)', tmpNextPage[0])
                                        tmpNextPage = re.findall("(\d+)", tmpNextPage[1])
                                        tmpNextPage = tmpNextPage[0]
                                        if nextPage == tmpNextPage:
                                            break
                                        else:
                                            nextPage = tmpNextPage
                                    except:
                                        break
                                except:
                                    break
                        except Exception as e:
                            debug(e)
                            debug("")
                        driver.close()
                        sleep(1)
                driver.switch_to_window(all_handles[0])
                sleep(2)
        driver.quit()

    def getConstitutionList(self, cur_page):
        url = "http://210.82.32.100:8081/FLFG/flfgGjjsAction.action"
        referer = "http://210.82.32.100:8081/FLFG/flfgGjjsAction.action"
        post = {
            "pagesize": "20",
            "pageCount": "500",
            "curPage": cur_page,
            "resultSearch": "false",
            # "lastStrWhere": "+SFYX:(有效)++^+ZLSX:(01~02~03~04~05~06~08~09~10~11~12~23)+NOT+TXTID=bj+^+SFFB=Y+",
            "lastStrWhere": " SFYX:(有效~已被修正~失效) ^(ZLSX:1111 ~ZLSX=01) ^ BMFL:(03) ^ SFFB=Y ",
            "bt": "",
            "flfgnr": "",
            "sxx": "有效,已被修正,失效",
            # "sxx": "有效",
            "zlsxid": "12",
            "bmflid": "",
            "xldj": "",
            "bbrqbegin": "2018-09-01",
            "bbrqend": "2018-12-17",
            "sxrqbegin": "",
            "sxrqend": "",
            "zdjg": "",
            "bbwh": ""
        }
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
        }
        data = curlData(url=url, value=post, referer=referer, header=header)
        return data

    def getAllConstitutionHandle(self, data, province):
        data = re.findall(r'<a[\w\W]*?href="javascript:showLocation([\w\W]*?);"', data)
        old1 = ""
        old5 = ""
        i = 0
        thread_list = list()
        for k, v in enumerate(data):
            data[k] = tuple(v.split("'"))
            try:
                # skip consecutive duplicates of the same document id / type
                if data[k][1] == old1 and data[k][7] == old5:
                    continue
                i = i + 1
                thread_list.append(ConstitutionThread(data[k][1], data[k][7], data[k][3], province, i))
                old1 = data[k][1]
                old5 = data[k][7]
            except:
                pass
        i = len(thread_list)
        if i == 0:
            # nothing new on this page; avoid indexing an empty thread list below
            return 1
        for m in range(i):
            thread_list[m].start()
        for m in range(i):
            thread_list[m].join()
        i = thread_list[0].getRv()
        # reset the counter
        thread_list[0].reset()
        self.count = self.count + i
        return 1

    def getConstitutionData(self, flfgID, zlsxid, showDetailType, province):
        # From browsing the site it is clear that the detail page uses the URL below,
        # taking GET parameters flfgID, zlsxid and keyword; the first two are required
        # and come from the JS data passed by the list page.
        flag = False
        url = "http://210.82.32.100:8081/FLFG/flfgByID.action"
        get = dict()
        get['flfgID'] = flfgID
        get['showDetailType'] = showDetailType
        get['zlsxid'] = zlsxid
        get['keyword'] = ""
        get = urlencode(get)
        url = url + "?" + get
        while True:
            try:
                data = curlData(url, get, url)
                break
            except:
                pass
        try:
            data = data.decode("utf-8")
        except:
            pass
        # with open("constitution.txt", "wb") as f:
        #     f.write(data.encode("utf-8"))
        #     f.close()
        # with open("constitution.txt", "rb") as f:
        #     data = f.read().decode("utf-8")
        #     f.close()
        handleDataAll = BeautifulSoup(data, "html.parser")
        handleData = handleDataAll.find_all("table")
        columns_list = ['type', "department_type", 'office', 'reference_num',
                        'issue_date', 'execute_date', 'timeliness']
        columns_name_list = ['资料属性:', '部门分类:', '制定机关:', '颁布文号:',
                             '颁布日期:', '施行日期:', '时 效 性:']
        # get the basic header information
        try:
            table_data = handleData[0].find_all("td")
        except:
            table_data = "数据获取出错"  # marker meaning "failed to fetch data"
            flag = True
        type_data = dict()
        type_data['url'] = url
        for k, v in enumerate(table_data):
            try:
                if (k + 1) % 2 == 1:
                    # odd cells hold the field label, the following cell holds its value
                    type_data[columns_list[columns_name_list.index(table_data[k].getText().strip())]] = \
                        table_data[k + 1].getText().strip()
            except:
                type_data[columns_list[columns_name_list.index(table_data[k].getText().strip())]] = "数据获取出错"
        # next, get the title and the content
        try:
            type_data['title'] = handleDataAll.find_all("div", attrs={"class": "bt"})[0].getText().strip()
        except:
            type_data['title'] = "标题获取出错"  # marker meaning "failed to fetch title"
            flag = True
        # fetch the body content
        try:
            type_data['content'] = str(handleDataAll.find_all("div", attrs={"id": "content"})[0])
        except:
            flag = True
        type_data['province'] = province
        if flag:
            type_data['is_get_error'] = 1
        else:
            type_data['is_get_error'] = 0
        while True:
            try:
                sql = self.db.getInsertSql(type_data, "constitutions")
                result = self.db.insert(sql, is_close_db=False)
                break
            except Exception as e:
                debug(e)
        return result
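# --- Usage sketch (not part of the original source) ---------------------------
# A minimal sketch of starting the constitution crawler. getAllConstitutionStart()
# pages through the search results and records its progress in
# `constitutions_record`, so re-running it resumes from the last saved page.
# The wrapper name is hypothetical.
def run_constitution_crawler():
    crawler = GetConstitutionList()
    crawler.getAllConstitutionStart()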