def download_latest_images_selenium(page, directory):
    """
    Fetch the latest Pexels photos with Selenium.
    :param page: page number to start from
    :param directory: directory to store the downloaded files
    :return:
    """
    SystemUtil.restart_process(os.path.abspath(__file__))
    driver = ReptileUtil.selenium_driver(
        "https://www.pexels.com/new-photos?page=" + str(page))
    try:
        articles = driver.find_elements_by_tag_name("article")
        next_page = True
        try:
            driver.find_element_by_xpath(
                "/html/body/section/div[4]/div/a[@rel='next']")
        except Exception:
            next_page = False
        # Remember the main window handle so we can switch back to it
        # after closing each detail tab.
        main_window = driver.current_window_handle
        print(articles)
        for article in articles:
            # Photo id
            image_id = article.get_attribute("data-photo-modal-medium-id")
            info_url = "https://www.pexels.com/photo/" + image_id
            # Open the photo detail page in a new tab via JavaScript.
            driver.execute_script(f"window.open('{info_url}')")
            driver.switch_to.window(driver.window_handles[-1])
            tags = ""
            # Skip tag extraction when the detail page returned a 500 error.
            if driver.title.find("500") == -1:
                tags = driver.find_element_by_xpath(
                    "//meta[@name='keywords']").get_attribute("content")
                # Translate the tags and normalise fullwidth commas to ASCII.
                tags = TranslationUtil.translate_google(tags).replace("，", ",")
                # Keep only lowercase letters, CJK characters and commas;
                # strip leading/trailing and duplicated commas.
                tags = re.sub(r"[^a-z,\u4e00-\u9fa5]+|^,|,$", "",
                              tags).replace(",,", ",")
            # Close the detail tab.
            driver.close()
            # Switch back to the main window after closing the tab;
            # skipping this step raises an error on the next iteration.
            driver.switch_to.window(main_window)
            # Image download link
            download_url = f"https://images.pexels.com/photos/{image_id}/pexels-photo-{image_id}.jpeg?dl={image_id}.jpg"
            s3.execute_commit(f"""
                INSERT OR IGNORE INTO images(image_id,suffix,url,type,page,tags)
                VALUES('{image_id}','jpg','{download_url}','latest','{page}','{tags}')
            """)
            image_name = f"pexels-photo-{image_id}.jpg"
            # Skip the download if the file already exists.
            if not os.path.exists(os.path.join(directory, image_name)):
                asyncio.run(
                    HttpUtil.download_one_async(download_url, directory,
                                                image_name))
        global run_count
        run_count += 1
        # Recurse to the next page while one exists and the recursion
        # budget (10 pages per process) is not exhausted.
        if next_page and run_count <= 10:
            download_latest_images_selenium(page + 1, directory)
        else:
            if next_page:
                page += 1
            else:
                page = 1
            run_count = 0
    except Exception as e:
        print(e)
    finally:
        # Close the current window.
        driver.close()
        # Quit the browser and kill the chromedriver process.
        driver.quit()
        print("Active thread count:", threading.active_count())
        time.sleep(400)
        download_latest_images_selenium(page, directory)
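
# The INSERT OR IGNORE statements in this module assume an `images` table in
# the SQLite database behind the `s3` helper. The repo's actual DDL is not
# shown here, so the sketch below is an assumption inferred from the column
# lists used above (and the helper name is illustrative), not the project's
# real schema.
def ensure_images_table():
    """Sketch only: create the `images` table the inserts above rely on."""
    s3.execute_commit("""
        CREATE TABLE IF NOT EXISTS images(
            image_id TEXT PRIMARY KEY,  -- uniqueness lets INSERT OR IGNORE skip rows already seen
            suffix   TEXT,              -- file extension, e.g. 'jpg'
            url      TEXT,              -- direct download link
            type     TEXT,              -- source category, e.g. 'latest'
            page     TEXT,              -- listing page the image was found on
            tags     TEXT               -- comma-separated, translated tags
        )
    """)
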
def download_latest_images(page, directory):
    """
    Fetch the latest Pexels photos with requests + BeautifulSoup.
    :param page: page number to start from
    :param directory: directory to store the downloaded files
    :return:
    """
    try:
        SystemUtil.restart_process(os.path.abspath(__file__))
        html = BeautifulSoup(
            HttpUtil.get("https://www.pexels.com/zh-cn/new-photos?page=" +
                         str(page)).text,
            features="lxml")
        articles = html.find_all("article")
        pages_html = BeautifulSoup(str(
            html.find("div", {"class": "pagination"})),
            features="lxml").find_all("a")
        # The second-to-last pagination link holds the total page count.
        page_total = int(pages_html[-2].text)
        print(page, len(articles), page_total)
        if page > page_total:
            page = 1
            raise ValueError("page out of range")
        for article in articles:
            # Photo id
            image_id = article["data-photo-modal-medium-id"]
            # Original image size
            # image_org_size = article["data-photo-modal-download-value-original"]
            # Image download link
            download_url = article["data-photo-modal-image-download-link"]
            image_name = f"pexels-photo-{image_id}.jpg"
            info_html = BeautifulSoup(
                HttpUtil.get("https://www.pexels.com/zh-cn/photo/" +
                             image_id).text,
                features="lxml")
            tags = info_html.find("meta", {
                "name": "keywords"
            }).attrs["content"]
            if len(tags) > 0 and tags != "":
                # Drop the fixed 7-character site suffix, then convert
                # Traditional Chinese to Simplified.
                tags = zhconv.convert(tags[:len(tags) - 7], 'zh-cn')
                # Keep only lowercase letters, CJK characters and commas;
                # strip leading/trailing and duplicated commas.
                tags = re.sub(r"[^a-z,\u4e00-\u9fa5]+|^,|,$", "",
                              tags).replace(",,", ",")
            s3.execute_commit(f"""
                INSERT OR IGNORE INTO images(image_id,suffix,url,type,page,tags)
                VALUES('{image_id}','{download_url[download_url.rfind(".") + 1:]}',
                '{download_url}','latest','{page}','{tags}')
            """)
            # dl = info_html.find(lambda tag: tag.has_attr('data-id') and tag.has_attr('href')).attrs["href"]
            # dl = info_html.find(lambda tag: tag.has_attr('data-id') and tag.has_attr('data-url')).attrs["data-url"]
            # Skip the download if the file already exists.
            if not os.path.exists(os.path.join(directory, image_name)):
                # Alternative: one download thread per image.
                # done = ThreadPool.pool.submit(HttpUtil.download_file, download_url, directory, image_name)
                # done.add_done_callback(ThreadPool.thread_call_back)
                asyncio.run(
                    HttpUtil.download_one_async(download_url, directory,
                                                image_name))
        global run_count
        run_count += 1
        # Recurse to the next page while pages remain and the recursion
        # budget (10 pages per process) is not exhausted.
        if page_total > 0 and page <= page_total and run_count <= 10:
            download_latest_images(page + 1, directory)
        else:
            if len(pages_html) > 0 and page <= page_total:
                page += 1
            if page > page_total:
                page = 1
            run_count = 0
    except Exception as e:
        print(e)
    finally:
        print("Active thread count:", threading.active_count())
        time.sleep(400)
        download_latest_images(page, directory)
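
# The tag-normalisation steps above repeat in each downloader; the helper
# below pulls them out for clarity. It is an illustrative sketch, not part of
# the original code: `zhconv.convert(..., 'zh-cn')` maps Traditional to
# Simplified Chinese, and the regex keeps only lowercase letters, CJK
# characters and commas before collapsing duplicates.
def clean_tags(raw):
    """Sketch only: normalise a raw keywords string into clean tags."""
    simplified = zhconv.convert(raw, 'zh-cn')
    cleaned = re.sub(r"[^a-z,\u4e00-\u9fa5]+|^,|,$", "", simplified)
    return cleaned.replace(",,", ",")
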
def download_images(url, page, directory):
    """
    Download images from wallhaven.
    :param url: listing URL, paged by appending the page number
    :param page: page number to start from
    :param directory: directory to store the downloaded files
    :return:
    """
    try:
        SystemUtil.restart_process(os.path.abspath(__file__))
        html = BeautifulSoup(HttpUtil.get(url + str(page)).text,
                             features="lxml")
        figure = html.find_all("figure")
        # Collect every tag that carries the 'original-title' attribute;
        # the last one holds the total page count.
        page_all = html.find_all(lambda tag: tag.has_attr('original-title'))
        page_total = int(page_all[-1].text)
        print(page, len(figure), page_total)
        if page > page_total:
            page = 1
            raise ValueError("page out of range")
        for label in figure:
            image_id = label.attrs["data-wallpaper-id"]
            # Image detail page
            info_html = BeautifulSoup(
                HttpUtil.get("https://wallhaven.cc/w/" + image_id).text,
                features="lxml")
            tags_html = info_html.find_all("a", {
                "class": "tagname",
                "rel": "tag"
            })
            # The image's tags, joined with commas (single quotes stripped
            # so the SQL below stays valid).
            tags = ",".join([tag_html.text
                             for tag_html in tags_html]).replace("'", "")
            if len(tags) > 0 and tags != "":
                # Translate the tags and normalise fullwidth commas to ASCII.
                tags = TranslationUtil.translate_google(tags).replace("，", ",")
                # Keep only lowercase letters, CJK characters and commas;
                # strip leading/trailing and duplicated commas.
                tags = re.sub(r"[^a-z,\u4e00-\u9fa5]+|^,|,$", "",
                              tags).replace(",,", ",")
            download_url = info_html.find("img", {
                "id": "wallpaper"
            }).attrs["src"]
            if len(download_url) <= 0 or download_url == "":
                raise ConnectionError("failed to obtain the download link")
            s3.execute_commit(f"""
                INSERT OR IGNORE INTO images(image_id,suffix,url,type,page,tags)
                VALUES('{image_id}','{download_url[download_url.rfind(".") + 1:]}',
                '{download_url}','latest','{page}','{tags}')
            """)
            image_name = download_url.split("/")[-1]
            # Skip the download if the file already exists.
            # if not os.path.exists(name):
            if not os.path.isfile(os.path.join(directory, image_name)):
                # Alternative: one download thread per image.
                # done = ThreadPool.pool.submit(HttpUtil.download_file, download_url, directory, image_name)
                # done.add_done_callback(ThreadPool.thread_call_back)
                asyncio.run(
                    HttpUtil.download_one_async(download_url, directory,
                                                image_name))
        global run_count
        run_count += 1
        # Recurse to the next page while pages remain and the recursion
        # budget (10 pages per process) is not exhausted.
        if len(page_all) > 0 and page <= page_total and run_count <= 10:
            download_images(url, page + 1, directory)
        else:
            if len(page_all) > 0:
                page += 1
            if page > page_total:
                page = 1
            run_count = 0
    except Exception as e:
        print(e)
    finally:
        print("Active thread count:", threading.active_count())
        time.sleep(400)
        download_images(url, page, directory)
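
# Usage sketch. Assumptions: the target directories already exist, the
# module-level `s3` helper and `run_count` counter are initialised elsewhere
# in the repo, and the wallhaven listing URL ends with its page query
# parameter so that `url + str(page)` forms a valid paged URL. Each
# downloader reschedules itself in its own `finally` block, so a single call
# loops indefinitely; pick one entry point.
if __name__ == "__main__":
    # download_latest_images_selenium(1, "./images")  # Selenium variant
    # download_latest_images(1, "./images")           # requests + BeautifulSoup variant
    download_images("https://wallhaven.cc/latest?page=", 1, "./images")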