Example #1
from random import randint
from threading import Thread


class MyThread(Thread):
    """Worker thread that scans for valid URLs."""

    def __init__(self, soup, thread_count):
        """
        Initialize the worker thread.

        :param soup: HTML page resource
        :param thread_count: thread counter object
        """
        super().__init__()  # parent-class constructor
        self._soup = soup  # store the parameters and set up helper objects
        self._conn = MyDatabase()
        self._db = self._conn.database
        self._url_coll = UrlColl(self._db)
        self._thread_count = thread_count
        self._thread_id = str(randint(100, 1000))

    def run(self):
        """
        Work performed once the thread starts.

        :return: None
        """
        print(self._thread_id + " thread started:")
        # Collect the valid URLs from the page
        get_useful_url(self._soup, self._thread_id, self._url_coll)
        self._thread_count.remove_one()
        print(self._thread_id + " thread exiting:")
        self._conn.close_conn()
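The thread_count object these examples pass around is never defined on this page. Judging from the add_one() and remove_one() calls here and in Example #7, it is a thread-safe counter; a minimal sketch under that assumption (the class name ThreadCount and the count property are guesses, not the project's actual code):

from threading import Lock


class ThreadCount:
    """Hypothetical lock-protected counter; only add_one()/remove_one()
    are attested in the examples, the rest is an assumption."""

    def __init__(self):
        self._count = 0
        self._lock = Lock()

    def add_one(self):
        with self._lock:
            self._count += 1

    def remove_one(self):
        with self._lock:
            self._count -= 1

    @property
    def count(self):
        with self._lock:
            return self._count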
Example #2
def get_one_url():
    conn = MyDatabase()
    my_database = conn.database  # open the connection and get the database object
    url_coll = UrlColl(my_database)  # get the URL collection
    url = url_coll.get_url()  # fetch one URL record from the collection
    conn.close_conn()
    return url
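None of the examples define MyDatabase, UrlColl, or BookColl. The .database attribute, the close_conn() method, and the collection-style helpers suggest a thin wrapper around a MongoDB connection; a minimal sketch under that assumption (pymongo, the connection settings, the collection names, and the document fields are all placeholders, not the original project's code):

from pymongo import MongoClient


class MyDatabase:
    """Hypothetical wrapper exposing the .database attribute and
    close_conn() method used throughout these examples."""

    def __init__(self):
        self._client = MongoClient("localhost", 27017)
        self.database = self._client["crawler"]

    def close_conn(self):
        self._client.close()


class UrlColl:
    """Hypothetical URL collection; field names are assumptions."""

    def __init__(self, database):
        self._coll = database["urls"]

    def get_url(self):
        # Return one not-yet-crawled URL document, or None when done.
        return self._coll.find_one({"crawled": 0})

    def update_url(self, url):
        # Mark a URL as handled so it is not fetched again.
        self._coll.update_one({"url": url}, {"$set": {"crawled": 1}})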
Example #3
    def __init__(self, thread_count):
        """
        Initialize the crawler object.

        :param thread_count: thread counter object
        """
        self._conn = MyDatabase()
        self._db = self._conn.database
        self._book_coll = BookColl(self._db)  # set up the collection objects
        self._url_coll = UrlColl(self._db)
        self._thread_count = thread_count
        self._error_log = ErrorLog()  # create the error-log writer
Example #4
    def __init__(self, soup, thread_count):
        """
        Initialize the worker thread.

        :param soup: HTML page resource
        :param thread_count: thread counter object
        """
        super().__init__()  # parent-class constructor
        self._soup = soup  # store the parameters and set up helper objects
        self._conn = MyDatabase()
        self._db = self._conn.database
        self._url_coll = UrlColl(self._db)
        self._thread_count = thread_count
        self._thread_id = str(randint(100, 1000))
Example #5
    def __init__(self):
        # Note: the MyDatabase instance itself is not kept, so its
        # connection can never be closed via close_conn() later.
        self._database = MyDatabase().database
        self._url_coll = UrlColl(self._database)
Example #6
def main():
    my_database = MyDatabase().database
    book_coll = BookColl(my_database)
    result = book_coll.get_book_name()
    print(result)
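BookColl is likewise undefined here. Continuing the pymongo assumption from the sketch after Example #2, a minimal version covering the two methods the examples actually call, insert_to_db() in Example #7 and get_book_name() above (the "books" collection name and the book_name field are placeholders):

class BookColl:
    """Hypothetical book collection; names and fields are assumptions."""

    def __init__(self, database):
        self._coll = database["books"]

    def insert_to_db(self, book):
        # Store one scraped book document (Example #7 builds this dict).
        self._coll.insert_one(book)

    def get_book_name(self):
        # Return the names of all stored books.
        return [doc["book_name"]
                for doc in self._coll.find({}, {"book_name": 1})]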
Example #7
import time

from bs4 import BeautifulSoup
from selenium import webdriver


class Crawler(object):
    def __init__(self, thread_count):
        """
        Initialize the crawler object.

        :param thread_count: thread counter object
        """
        self._conn = MyDatabase()
        self._db = self._conn.database
        self._book_coll = BookColl(self._db)  # set up the collection objects
        self._url_coll = UrlColl(self._db)
        self._thread_count = thread_count
        self._error_log = ErrorLog()  # create the error-log writer

    def get_book(self, url):
        """
        Fetch the data for one book.

        :param url: URL of the book page
        :return: None
        """
        book = {}  # dict that collects the scraped fields
        # Start the browser driver and get the driver object
        # (a raw string keeps the backslashes in the Windows path literal)
        driver = webdriver.Firefox(
            executable_path=r'E:\DevelopTools\Python\geckodriver')
        # driver = webdriver.Ie(executable_path=r'E:\DevelopTools\Python\IEDriverServer')
        try:
            driver.set_page_load_timeout(12)  # page-load timeout
            driver.set_script_timeout(30)  # script-execution timeout
            driver.get(url)  # point the browser at the page
            js = "var q=document.documentElement.scrollTop=100000"  # JS: scroll down 100000px
            driver.execute_script(js)  # run the script
            time.sleep(1)  # wait for the browser to finish
            js = "var q=document.documentElement.scrollTop=0"  # JS: scroll back to the top
            driver.execute_script(js)  # run the script
            time.sleep(2)  # wait for the browser to finish
            js = "var q=document.documentElement.scrollTop=100000"  # JS: scroll to the bottom again
            driver.execute_script(js)  # run the script
            time.sleep(1)  # wait; the simulated scrolling lets lazy-loaded content render
            soup = BeautifulSoup(driver.page_source,
                                 "lxml")  # hand the page source to bs4
        except Exception as e:
            print(e)  # print the error
            self._error_log.write_error(e)  # record the error
            return  # give up on this URL
        finally:
            driver.close()  # close the browser
        # target = driver.find_element_by_id("footer")
        # driver.execute_script("arguments[0].scrollIntoView();", target)  # scroll the element into view

        # Field extraction from the relevant tags follows
        null_wrap = soup.find("div", {"class": "null_wrap"})
        if null_wrap is not None:  # the page is an empty "not found" placeholder
            self._url_coll.update_url(url)
            return
        book['url'] = url
        book_name = soup.find("div", {"class": "name_info"})
        if book_name is None:
            self._url_coll.update_url(url)
            return
        book['book_name'] = book_name.h1.get_text(strip=True)
        book['image_url'] = soup.find("div", {"class": "big_pic"}).img['src']
        book['book_type'] = soup.find("div", {
            "class": "breadcrumb"
        }).get_text(strip=True)
        book['introduction'] = soup.find("span", {
            "class": "head_title_name"
        }).get_text(strip=True)
        author = soup.find("span", {"id": "author"})
        if author is None:
            book['author'] = ""
        else:
            book['author'] = author.text
        messbox = soup.find("div", {"class": "messbox_info"})
        for item in messbox:  # the children hold the publisher and publish-date lines
            if "出版社" in str(item):  # "publisher" (the page labels are Chinese)
                book['publishing'] = item.get_text(strip=True)
            elif "出版时间" in str(item):  # "publication date"
                book['publishing_time'] = item.get_text(strip=True)
        book['price'] = soup.find("p", {
            "id": "dd-price"
        }).get_text(strip=True).split("¥")[1]
        editors_choice = soup.find("div", {"id": "abstract"})
        if editors_choice is None:
            book['editors_choice'] = ""
        else:
            book['editors_choice'] = editors_choice.contents[1].get_text()
        content_validity = soup.find("div", {"id": "content"})
        if content_validity is None:
            book['content_validity'] = ""
        else:
            book['content_validity'] = content_validity.contents[1].get_text()
        about_author = soup.find("div", {"id": "authorIntroduction"})
        if about_author is None:
            book['about_author'] = ""
        else:
            book['about_author'] = about_author.contents[1].get_text()
        catalog = soup.find("textarea", {"id": "catalog-textarea"})
        if catalog is None:
            catalog2 = soup.find("div", {"id": "catalog"})
            if catalog2 is None:
                book['catalog'] = ""
            else:
                book['catalog'] = catalog2.contents[1].get_text()
        else:
            book['catalog'] = catalog.get_text(strip=True)
        media_reviews = soup.find("div", {"id": "mediaFeedback"})
        if media_reviews is None:
            book['media_reviews'] = ""
        else:
            book['media_reviews'] = media_reviews.get_text()
        # Data collected successfully; insert it into the book collection
        self._book_coll.insert_to_db(book)
        # Note: this closes this Crawler's own connection, so the instance
        # cannot fetch another book afterwards.
        self._conn.close_conn()
        print(url + " done")
        try:
            self._thread_count.add_one()  # increment the thread counter
            thread = MyThread(soup, self._thread_count)  # create the worker thread
            thread.start()  # start it
        except Exception as e:
            self._error_log.write_error(e)  # write to the error log
            print("Error: unable to start thread: " + str(e))