def __init__(self):
    self.downloader = aszwDownloader.Downloader()
    self.parser = aszwParser.Parser()
    # Cookie pool
    self.cookies = dbController.dbc('bookwarehouse').getCookies()
    # Proxy pool
    self.proxies = proxies.get_proxy('http://www.xicidaili.com/nn/',
                                     {'User-agent': 'Mr.Zhang'})
def initDatebase(self):
    '''
    Initialize the database: crawl the entire book site into the database.
    :return:
    '''
    # Initialize the parser and downloader modules
    parser = aszwParser.Parser()
    downloader = aszwDownloader.Downloader()
    cookies = self.getCookies()
    # Crawl the list-page urls from the aszw (傲视中文网) book index
    list_url = parser.find_list_urls()
    # Walk each list page; each one contains book index urls
    for list_page in list_url:
        # Parse the book urls out of the list page
        books_urls = parser.find_books_urls(list_page)
        # Visit each book's home page
        for book_url in books_urls:
            # Parse the home page for the chapter url list, title,
            # category and author
            sections_url, title, category, auth = parser.find_section_urls(book_url)
            book = {
                'name': title,
                'category': category,
                'auth': auth,
                'wordage': -1,
                'book_url': book_url,
                'source': 1,
            }
            chapters = []
            for section_url in sections_url:
                # Pick a random cookie from the pool to download the page
                cookie = random.choice(cookies)
                # Download the chapter page and parse out its title and body
                html_cont = downloader.m_download(section_url, cookie)
                new_data = parser.parser_Section(html_cont)
                # Store the chapter name and url in chapters
                chapter = {
                    'chapter_name': new_data['section_title'],
                    'chapter_url': section_url,
                }
                chapters.append(chapter)
            # Attach the chapter list to the book record
            book['chapters'] = chapters
            self.insetBook(book)
            self.book_warehouse.append(book)
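# self.insetBook is called above but not defined in this section. For
# reference, a minimal sketch of what that persistence step could look like,
# assuming a MySQL `books` table whose columns mirror the book dict keys.
# The table schema and column names are assumptions, not the project's
# actual code; chapter rows would presumably be stored separately:
def insetBook_sketch(self, book):
    # Hypothetical insert matching the %s placeholder style used in getBook
    cursor = self.db.cursor()
    sql = ("insert into books (name, category, auth, wordage, book_url, source) "
           "values (%s, %s, %s, %s, %s, %s)")
    cursor.execute(sql, (book['name'], book['category'], book['auth'],
                         book['wordage'], book['book_url'], book['source']))
    self.db.commit()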
def getBook(self, id):
    '''
    Download a book to the server.
    :param id:
    :return:
    '''
    # Look up the book url by id
    cursor = self.db.cursor()
    sql = "select * from books where id = %s"
    cursor.execute(sql, (id,))
    row = cursor.fetchone()
    book_url = row[7]
    # Initialize the parser and downloader modules
    parser = aszwParser.Parser()
    downloader = aszwDownloader.Downloader()
    cookies = self.getCookies()
    user_agent = self.getUserAgent()
    proxy_list = proxies.get_proxy('http://www.xicidaili.com/nn/',
                                   {'User-agent': 'Mr.Zhang'})
    # Parse the book's home page for the chapter url list, title,
    # category and author
    sections_url, title, category, auth = parser.find_section_urls(book_url)
    print('----crawling book:', title)
    chapters = []
    # Skip the book if it has already been downloaded
    if os.path.exists("/home/ubuntu/book/" + title + "_" + auth + ".txt"):
        print(title + " already downloaded...")
        return
    # Collector that accumulates chapter contents for the output file
    outputer = aszwWriter.Writer(len(sections_url), title, auth)

    # Parse one chapter and append it to chapters
    def parseSction(section_url):
        # Use the enclosing scope's variables
        nonlocal threads, chapters
        try:
            # Pick a random cookie from the pool to download the page
            cookie = random.choice(cookies)
            proxy = random.choice(proxy_list)
            # Download the chapter page and parse out its title and body
            html_cont = downloader.m_download(
                section_url,
                cookie=cookie,
                user_agent=random.choice(user_agent),
                proxy=proxy)
            new_data = parser.parser_Section(html_cont)
            # Collect the chapter contents so they can be written out
            # once all chapters have been crawled
            outputer.collect_data(new_data)
            # Store the chapter name, url and body in chapters
            chapter = {
                'chapter_name': new_data['section_title'],
                'chapter_url': section_url,
                'chapter_context': new_data['text'],
            }
            chapters.append(chapter)
        except Exception as e:
            print(e)
        finally:
            # The decrement must live in finally: if this function dies on
            # a bug, threads would otherwise never reach zero
            threads -= 1

    # Parse the chapters on multiple threads
    threads = 0
    print('chapters to parse:', len(sections_url))
    while sections_url:
        while sections_url and threads < 40:
            threads += 1
            section_url = sections_url.pop()
            _thread.start_new_thread(parseSction, (section_url, ))
        # Busy-wait until the current batch of workers has finished
        while threads > 0:
            pass
    print('writing book to local disk')
    # Write the book contents to a file
    print(outputer.output_html())
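# The manual threads counter plus `while threads > 0: pass` above works but
# burns a CPU core while polling. A standalone sketch of the same fan-out
# built on the standard library's concurrent.futures; fetch_chapter is a
# hypothetical stand-in for parseSction, not a function in this project:
from concurrent.futures import ThreadPoolExecutor


def crawl_sections_sketch(sections_url, fetch_chapter, max_workers=40):
    # map runs fetch_chapter over every chapter url, at most max_workers at
    # a time; leaving the with-block joins all workers, which replaces both
    # the manual threads counter and the busy-wait loop
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(fetch_chapter, sections_url))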
def initDatebaseContext(self):
    '''
    Initialize the database: crawl the entire book site into the database,
    including chapter contents.
    :return:
    '''
    # Initialize the parser and downloader modules
    parser = aszwParser.Parser()
    downloader = aszwDownloader.Downloader()
    cookies = self.getCookies()
    user_agent = self.getUserAgent()
    proxy_list = proxies.get_proxy('http://www.xicidaili.com/nn/',
                                   {'User-agent': 'Mr.Zhang'})
    # Crawl the list-page urls from the aszw (傲视中文网) book index
    list_url = parser.find_list_urls()
    # Walk each list page; each one contains book index urls
    for list_page in list_url:
        # Parse the book urls out of the list page
        books_urls = parser.find_books_urls(list_page)
        # Visit each book's home page
        for book_url in books_urls:
            try:
                # Skip the book if it is already in the database
                if self.checkBookExist(book_url):
                    continue
                # Parse the home page for the chapter url list, title,
                # category and author
                sections_url, title, category, auth = parser.find_section_urls(book_url)
                print('----crawling book:', title)
                book = {
                    'name': title,
                    'category': category,
                    'auth': auth,
                    'wordage': -1,
                    'book_url': book_url,
                    'source': 1,
                }
                chapters = []
                # Running word count for the whole book
                wordage = 0

                # Parse one chapter and append it to chapters
                def parseSction(section_url, i):
                    # Use the enclosing scope's variables
                    nonlocal threads, chapters, wordage
                    try:
                        # Pick a random cookie from the pool to download the page
                        cookie = random.choice(cookies)
                        proxy = random.choice(proxy_list)
                        # Download the chapter page and parse out its title and body
                        html_cont = downloader.m_download(
                            section_url,
                            cookie=cookie,
                            user_agent=random.choice(user_agent),
                            proxy=proxy)
                        new_data = parser.parser_Section(html_cont)
                        # Store the chapter key, url and body in chapters
                        chapter = {
                            'chapter_name': i,
                            'chapter_url': section_url,
                            'context': new_data['text'],
                        }
                        chapters.append(chapter)
                        # Rough word count: discount whitespace padding and a
                        # fixed amount of per-chapter boilerplate
                        wordage += (len(new_data['text'])
                                    - new_data['text'].count(" ") * 4 - 25)
                    except Exception:
                        print(section_url, '-----failed to fetch chapter contents')
                    finally:
                        # Release the thread slot, even on failure, so the
                        # scheduler below never waits forever
                        threads -= 1

                # Crawl the chapters on multiple threads
                threads = 0
                # Running chapter key
                i = 1
                while sections_url:
                    while sections_url and threads < 20:
                        threads += 1
                        # Pop from the front so chapter keys stay in order
                        section_url = sections_url.pop(0)
                        _thread.start_new_thread(parseSction,
                                                 (section_url, self.i2a(i)))
                        i += 1
                # Wait for the remaining workers before storing the book,
                # otherwise chapters and wordage would be incomplete
                while threads > 0:
                    pass
                # Attach the chapter list and word count to the book record
                book['chapters'] = chapters
                book['wordage'] = wordage
                self.insetBook(book)
                self.book_warehouse.append(book)
            except Exception:
                s = sys.exc_info()
                print("Error '%s' happened on line %d" % (s[1], s[2].tb_lineno))
                # Re-queue the failed book url for another attempt
                books_urls.append(book_url)
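# self.i2a is called above but not defined in this section. One plausible
# reading, given it turns the running chapter counter into the chapter key,
# is an int-to-string converter whose output sorts lexicographically. This
# is a hypothetical sketch only, not the project's actual implementation:
def i2a(self, i, width=5):
    # Zero-pad the chapter index so '00002' sorts before '00010'
    return str(i).zfill(width)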
def initDatebase(self):
    '''
    Initialize the database: crawl the entire book site into the database.
    :return:
    '''
    # Initialize the parser and downloader modules
    parser = aszwParser.Parser()
    downloader = aszwDownloader.Downloader()
    cookies = self.getCookies()
    user_agent = self.getUserAgent()
    proxy_list = proxies.get_proxy('http://www.xicidaili.com/nn/',
                                   {'User-agent': 'Mr.Zhang'})
    # Crawl the list-page urls from the aszw (傲视中文网) book index
    list_url = parser.find_list_urls()
    # Walk each list page; each one contains book index urls
    for list_page in list_url:
        # Parse the book urls out of the list page
        books_urls = parser.find_books_urls(list_page)
        # Visit each book's home page
        for book_url in books_urls:
            # Parse the home page for the chapter url list, title,
            # category and author
            sections_url, title, category, auth = parser.find_section_urls(book_url)
            print('----crawling book:', title)
            # Skip the book if it has already been downloaded
            if os.path.exists("/home/ubuntu/book/" + title + "_" + auth + ".txt"):
                print(title + " already downloaded...")
                continue
            book = {
                'name': title,
                'category': category,
                'auth': auth,
                'wordage': -1,
                'book_url': book_url,
                'source': 1,
            }
            chapters = []
            outputer = aszwWriter.Writer(len(sections_url), title, auth)

            # Parse one chapter and append it to chapters
            def parseSction(section_url):
                # Use the enclosing scope's variables
                nonlocal threads, chapters
                try:
                    # Pick a random cookie from the pool to download the page
                    cookie = random.choice(cookies)
                    proxy = random.choice(proxy_list)
                    # Download the chapter page and parse out its title and body
                    html_cont = downloader.m_download(
                        section_url,
                        cookie=cookie,
                        user_agent=random.choice(user_agent),
                        proxy=proxy)
                    new_data = parser.parser_Section(html_cont)
                    # Store the chapter name and url in chapters
                    chapter = {
                        'chapter_name': new_data['section_title'],
                        'chapter_url': section_url,
                    }
                    chapters.append(chapter)
                    # Collect the chapter contents so they can be written out
                    # once all chapters have been crawled
                    outputer.collect_data(new_data)
                finally:
                    # Release the thread slot, even if parsing failed
                    threads -= 1

            # Crawl the chapters on multiple threads
            threads = 0
            while sections_url:
                while sections_url and threads < 40:
                    threads += 1
                    section_url = sections_url.pop()
                    _thread.start_new_thread(parseSction, (section_url, ))
            # Wait for the remaining workers before storing the book
            while threads > 0:
                pass
            # Attach the chapter list to the book record
            book['chapters'] = chapters
            self.insetBook(book)
            self.book_warehouse.append(book)
            print('writing book to local disk')
            # Write the book contents to a file
            print(outputer.output_html())
def __init__(self):
    self.downloader = aszwDownloader.Downloader()
    self.parser = aszwParser.Parser()
    self.cookies = dbController.dbc('bookwarehouse').getCookies()
def testfind_section_urls():
    url = 'https://www.23zw.me/olread/79/79709/index.html'
    parser = aszwParser.Parser()
    parser.find_section_urls(url)
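# Example invocation when running this test module directly; assumes
# aszwParser is importable from the project root:
if __name__ == '__main__':
    testfind_section_urls()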