from random import randint
from threading import Thread


class MyThread(Thread):
    """Worker thread that scans a page for valid URLs."""

    def __init__(self, soup, thread_count):
        """
        Initialize the worker thread.
        :param soup: parsed HTML page (BeautifulSoup object)
        :param thread_count: shared thread-counter object
        """
        super().__init__()  # call the parent constructor
        self._soup = soup  # store parameters and helper objects
        self._conn = MyDatabase()
        self._db = self._conn.database
        self._url_coll = UrlColl(self._db)
        self._thread_count = thread_count
        self._thread_id = str(randint(100, 1000))

    def run(self):
        """
        Work performed once the thread starts.
        :return: None
        """
        print(self._thread_id + " thread started")
        # Extract valid URLs from the page
        get_useful_url(self._soup, self._thread_id, self._url_coll)
        self._thread_count.remove_one()
        print(self._thread_id + " thread exiting")
        self._conn.close_conn()
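The thread-counter object passed in as `thread_count` is not shown in these snippets. Judging only from the `add_one()` and `remove_one()` calls, a minimal thread-safe sketch could look like the following; the class name `ThreadCount` and its internals are assumptions, not the original implementation:

from threading import Lock


class ThreadCount:
    """Sketch (assumed): thread-safe counter matching the add_one()/remove_one() calls."""

    def __init__(self):
        self._count = 0
        self._lock = Lock()

    def add_one(self):
        with self._lock:
            self._count += 1

    def remove_one(self):
        with self._lock:
            self._count -= 1

    @property
    def count(self):
        with self._lock:
            return self._count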
def get_one_url():
    conn = MyDatabase()
    my_database = conn.database  # open the connection and get the database handle
    url_coll = UrlColl(my_database)  # get the URL collection
    url = url_coll.get_url()  # fetch one URL record from the collection
    conn.close_conn()
    return url
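`MyDatabase` and `UrlColl` are defined elsewhere in the project. Based only on the attributes used in these snippets (`.database`, `close_conn()`, `get_url()`, `update_url()`), a minimal pymongo-backed sketch might be as follows; the database name, collection name, and the `crawled` flag are assumptions:

from pymongo import MongoClient


class MyDatabase:
    """Sketch (assumed): wraps a MongoClient and exposes the database handle."""

    def __init__(self, host="localhost", port=27017, db_name="book_db"):
        self._client = MongoClient(host, port)  # hypothetical connection settings
        self.database = self._client[db_name]

    def close_conn(self):
        self._client.close()


class UrlColl:
    """Sketch (assumed): wraps a 'url' collection of pending and finished URLs."""

    def __init__(self, database):
        self._coll = database["url"]

    def get_url(self):
        # Fetch one URL record that has not been crawled yet.
        return self._coll.find_one({"crawled": 0})

    def update_url(self, url):
        # Mark the URL as handled so it is not fetched again.
        self._coll.update_one({"url": url}, {"$set": {"crawled": 1}})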
def __init__(self):
    self._database = MyDatabase().database
    self._url_coll = UrlColl(self._database)
def main():
    my_database = MyDatabase().database
    book_coll = BookColl(my_database)
    result = book_coll.get_book_name()
    print(result)
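`BookColl` is likewise defined elsewhere; a minimal sketch consistent with the `insert_to_db()` and `get_book_name()` calls in these snippets might be the following, where the collection and field names are assumptions:

class BookColl:
    """Sketch (assumed): wraps a 'book' collection of scraped book documents."""

    def __init__(self, database):
        self._coll = database["book"]

    def insert_to_db(self, book):
        # Store one scraped book dict as a document.
        self._coll.insert_one(book)

    def get_book_name(self):
        # Return the names of all stored books.
        return [doc["book_name"] for doc in self._coll.find({}, {"book_name": 1, "_id": 0})]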
import time

from bs4 import BeautifulSoup
from selenium import webdriver


class Crawler(object):
    def __init__(self, thread_count):
        """
        Initialize the crawler.
        :param thread_count: shared thread-counter object
        """
        self._conn = MyDatabase()
        self._db = self._conn.database
        self._book_coll = BookColl(self._db)  # collection wrappers
        self._url_coll = UrlColl(self._db)
        self._thread_count = thread_count
        self._error_log = ErrorLog()  # error-log writer

    def get_book(self, url):
        """
        Scrape one book page and store the result.
        :param url: URL of the book page to fetch
        :return: None
        """
        book = {}  # dict that collects the scraped fields
        # Start the browser driver and get a driver object
        driver = webdriver.Firefox(
            executable_path=r'E:\DevelopTools\Python\geckodriver')
        # driver = webdriver.Ie(executable_path=r'E:\DevelopTools\Python\IEDriverServer')
        try:
            driver.set_page_load_timeout(12)  # page-load timeout
            driver.set_script_timeout(30)  # script-execution timeout
            driver.get(url)  # load the target page
            # Scroll to the bottom (100000 px) so lazy-loaded content renders
            js = "var q=document.documentElement.scrollTop=100000"
            driver.execute_script(js)
            time.sleep(1)  # give the browser time to execute
            js = "var q=document.documentElement.scrollTop=0"  # back to the top
            driver.execute_script(js)
            time.sleep(2)
            js = "var q=document.documentElement.scrollTop=100000"  # back to the bottom
            driver.execute_script(js)
            time.sleep(1)  # simulated scrolling is done
            soup = BeautifulSoup(driver.page_source, "lxml")  # parse the rendered page
        except Exception as e:
            print(e)  # report the error
            self._error_log.write_error(e)  # record it in the error log
            return
        finally:
            driver.close()  # close the browser either way
        # target = driver.find_element_by_id("footer")
        # driver.execute_script("arguments[0].scrollIntoView();", target)  # scroll an element into view
        # Extract the individual fields from the page
        null_wrap = soup.find("div", {"class": "null_wrap"})
        if null_wrap is not None:  # "page not found" marker: mark the URL as done and bail out
            self._url_coll.update_url(url)
            return
        book['url'] = url
        book_name = soup.find("div", {"class": "name_info"})
        if book_name is None:
            self._url_coll.update_url(url)
            return
        book['book_name'] = book_name.h1.get_text(strip=True)
        book['image_url'] = soup.find("div", {"class": "big_pic"}).img['src']
        book['book_type'] = soup.find("div", {"class": "breadcrumb"}).get_text(strip=True)
        book['introduction'] = soup.find("span", {"class": "head_title_name"}).get_text(strip=True)
        author = soup.find("span", {"id": "author"})
        if author is None:
            book['author'] = ""
        else:
            book['author'] = author.text
        messbox = soup.find("div", {"class": "messbox_info"})
        for item in messbox:
            if "出版社" in str(item):  # publisher
                book['publishing'] = item.get_text(strip=True)
            elif "出版时间" in str(item):  # publication date
                book['publishing_time'] = item.get_text(strip=True)
        book['price'] = soup.find("p", {"id": "dd-price"}).get_text(strip=True).split("¥")[1]
        editors_choice = soup.find("div", {"id": "abstract"})
        if editors_choice is None:
            book['editors_choice'] = ""
        else:
            book['editors_choice'] = editors_choice.contents[1].get_text()
        content_validity = soup.find("div", {"id": "content"})
        if content_validity is None:
            book['content_validity'] = ""
        else:
            book['content_validity'] = content_validity.contents[1].get_text()
        about_author = soup.find("div", {"id": "authorIntroduction"})
        if about_author is None:
            book['about_author'] = ""
        else:
            book['about_author'] = about_author.contents[1].get_text()
        catalog = soup.find("textarea", {"id": "catalog-textarea"})
        if catalog is None:
            catalog2 = soup.find("div", {"id": "catalog"})
            if catalog2 is None:
                book['catalog'] = ""
            else:
                book['catalog'] = catalog2.contents[1].get_text()
        else:
            book['catalog'] = catalog.get_text(strip=True)
        media_reviews = soup.find("div", {"id": "mediaFeedback"})
        if media_reviews is None:
            book['media_reviews'] = ""
        else:
            book['media_reviews'] = media_reviews.get_text()
        # All fields collected: insert the book into the book collection
        self._book_coll.insert_to_db(book)
        self._conn.close_conn()
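Putting the pieces together, a driving loop might look like the sketch below. Whether `get_one_url()` returns a plain string or a document with a `url` field depends on `UrlColl.get_url()`, so the field access here is an assumption:

def main():
    thread_count = ThreadCount()  # shared counter (see the sketch above)
    crawler = Crawler(thread_count)
    record = get_one_url()  # one pending URL record
    while record is not None:
        crawler.get_book(record["url"])  # assumes a document with a 'url' field
        record = get_one_url()


if __name__ == "__main__":
    main()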
print(url + " done")
try:
    self._thread_count.add_one()  # increment the thread counter
    thread = MyThread(soup, self._thread_count)  # create the URL-scanner thread
    thread.start()  # start it
except Exception as e:
    self._error_log.write_error(e)  # write to the error log
    print("Error: unable to start thread: " + str(e))