def parser_list(task):
    response = task['response']
    new_tasks = []

    if not response:
        LOG.log_it("Not Response", 'WARN')
        raise RetryDownload

    try:
        data = response.json()
    except Exception as e:
        LOG.log_it(
            '解析JSON出错(如一直出现,而且浏览器能正常访问,可能是网站代码升级,请通知开发者。)\nERRINFO:{}'.format(
                str(e)), 'WARN')
        raise RetryDownload

    try:
        for each_result in data['result']:
            title = each_result['title']
            url = each_result['url']

            date_group = re.search(r'(.*?)T(.*?)\+', each_result['date_created'])
            date = date_group.group(1) + ' ' + date_group.group(2)

            meta = deepcopy(task['meta'])
            save = deepcopy(task['save'])
            save.update({'title': title, 'date': date})

            new_task = Task.make_task({
                'url': url,
                'method': 'GET',
                'parser': parser_content,
                'resulter': resulter_content,
                'priority': 1,
                'meta': meta,
                'save': save
            })
            new_tasks.append(new_task)
    except KeyError:
        LOG.log_it('JSON KEY出错(如一直出现,而且浏览器能正常访问,可能是网站代码升级,请通知开发者。)', 'WARN')
        raise RetryDownload

    # Fetch the next page
    meta = deepcopy(task['meta'])
    save = deepcopy(task['save'])
    save['cursor'] += 20

    if save['cursor'] < save['end'] and not len(data['result']) < 20:
        new_task = Task.make_task({
            'url': API_URL.format(save['cursor']),
            'method': 'GET',
            'meta': meta,
            'parser': parser_list,
            'priority': 0,
            'save': save,
            'retry': 3,
        })
        new_tasks.append(new_task)

    return None, new_tasks

def parser_content(task):
    title = task['title']
    new_tasks = []
    response = task['response']

    if not response:
        raise RetryDownload

    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.PostIndex-content')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("不能找到文章的内容。(如一直出现,而且浏览器能正常访问知乎,可能是知乎代码升级,请通知开发者。)", 'WARN')
        raise RetryDownload

    author_name = bs.select('.PostIndex-authorName')[0].string if bs.select(
        '.PostIndex-authorName') else ''
    voteup_count = re.search(
        r'likesCount":(\d+),', response.text).group(1) if re.search(
            r'likesCount":(\d+),', response.text) else ''
    created_time = str(
        bs.select('.PostIndex-header .HoverTitle')[1]['data-hover-title']
    ) if len(bs.select('.PostIndex-header .HoverTitle')) == 2 else ''
    article_url = task['url']

    download_img_list, content = format_zhihu_content(content, task)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks

def parser_content(task):
    title = task['title']
    new_tasks = []
    response = task['response']

    if not response:
        raise RetryDownload

    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.show-content')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("不能找到文章的内容。(如一直出现,而且浏览器能正常访问网站,可能是网站代码升级,请通知开发者。)", 'WARN')
        raise RetryDownload

    author_name = bs.select('.post .author .name a')[0].string if bs.select(
        '.post .author .name a') else ''
    voteup_count = bs.select(
        '.post .author .meta .likes-count')[0].string if bs.select(
            '.post .author .meta .likes-count') else ''
    created_time = bs.select(
        '.post .author .meta .publish-time')[0].string if bs.select(
            '.post .author .meta .publish-time') else ''
    article_url = task['url']

    download_img_list, content = format_content(content, task)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks

def parser_content(task):
    title = task['title']
    new_tasks = []
    response = task['response']

    if not response:
        raise RetryDownload

    try:
        content = response.json()['body']
    except Exception as e:
        LOG.log_it(
            '解析JSON出错(如一直出现,而且浏览器能正常访问网站,可能是网站代码升级,请通知开发者。)ERRINFO:{}'.format(
                str(e)), 'WARN')
        raise RetryDownload

    bs = BeautifulSoup(content, 'lxml')
    content = str(bs.select('div.content')[0])

    author_name = bs.select('.author')[0].string if bs.select(
        '.author') else ''
    voteup_count = ''
    created_time = ''
    article_url = task['url']

    download_img_list, content = format_zhihu_content(content, task)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks

def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        raise RetryDownload

    try:
        data = response.json()['stories']
    except Exception as e:
        LOG.log_it(
            '解析JSON出错(如一直出现,而且浏览器能正常访问网站,可能是网站代码升级,请通知开发者。)ERRINFO:{}'.format(
                str(e)), 'WARN')
        raise RetryDownload

    for item in data:
        # Items that are already in the database have been crawled before,
        # so skip them and stop requesting further pages.
        url = 'http://news-at.zhihu.com/api/4/story/' + str(item['id'])
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue

        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': item['title'],
        })
        new_tasks.append(new_task)

    # Next page
    if not IS_TODAY_URL and to_next:
        next_datetime = get_next_datetime_string(task['save']['cursor'],
                                                 '%Y%m%d', 1)
        # The cursor will always reach the end date eventually, so this terminates.
        if compare_datetime_string(task['save']['end'], next_datetime,
                                   '%Y%m%d') and len(data) != 0:
            next_page_task = deepcopy(task)
            next_page_task.update({
                'url':
                re.sub(r'before/\d+', 'before/{}'.format(next_datetime),
                       next_page_task['url'])
            })
            next_page_task['save'].update({'cursor': next_datetime})
            new_tasks.append(next_page_task)

    return None, new_tasks

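# The paging above relies on two date helpers that are not shown in this
# section. The sketch below is an assumption inferred only from how they are
# called here: get_next_datetime_string advances a formatted date string by
# whole days, and compare_datetime_string keeps paging until the end date is
# reached. The project's real implementations may differ.
import datetime


def get_next_datetime_string(datetime_string, fmt, days):
    # Assumed: shift the date string forward by `days` days.
    next_dt = datetime.datetime.strptime(datetime_string, fmt) + \
        datetime.timedelta(days=days)
    return next_dt.strftime(fmt)


def compare_datetime_string(a, b, fmt):
    # Assumed: True while `a` (the end date) has not yet been passed by `b`.
    return datetime.datetime.strptime(a, fmt) >= datetime.datetime.strptime(b, fmt)
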
def parser_list(task):
    response = task['response']
    new_tasks = []

    if not response:
        raise RetryDownload

    try:
        data = response.json()
        data.reverse()
    except Exception as e:
        LOG.log_it(
            '解析JSON出错(如一直出现,而且浏览器能正常访问知乎,可能是知乎代码升级,请通知开发者。)ERRINFO:{}'.format(
                str(e)), 'WARN')
        raise RetryDownload

    if len(data) != 0:
        if task['save']['cursor'] < task['save']['end'] - 20:
            next_page_task = deepcopy(task)
            next_page_task.update({
                'url':
                re.sub(r'offset=\d+',
                       'offset={}'.format(task['save']['cursor'] + 20),
                       next_page_task['url'])
            })
            next_page_task['save'].update(
                {'cursor': next_page_task['save']['cursor'] + 20})
            new_tasks.append(next_page_task)
    else:
        LOG.log_it('不能读取专栏列表。(如一直出现,而且浏览器能正常访问知乎,可能是知乎代码升级,请通知开发者。)', 'WARN')
        raise RetryDownload

    for item in data:
        new_task = Task.make_task({
            'url': 'https://zhuanlan.zhihu.com' + item['url'],
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': item['title'],
        })
        new_tasks.append(new_task)

    return None, new_tasks

def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        raise RetryDownload

    try:
        data = response.json()
        data.reverse()
    except Exception as e:
        LOG.log_it(
            '解析JSON出错(如一直出现,而且浏览器能正常访问知乎,可能是知乎代码升级,请通知开发者。)ERRINFO:{}'.format(
                str(e)), 'WARN')
        raise RetryDownload

    for item in data:
        # Items that are already in the database have been crawled before,
        # so skip them and stop requesting further pages.
        url = 'https://zhuanlan.zhihu.com' + item['url']
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue

        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': item['title'],
        })
        new_tasks.append(new_task)

    # Next page
    if to_next and len(data) != 0:
        if task['save']['cursor'] < task['save']['end'] - 20:
            next_page_task = deepcopy(task)
            next_page_task.update({
                'url':
                re.sub(r'offset=\d+',
                       'offset={}'.format(task['save']['cursor'] + 20),
                       next_page_task['url'])
            })
            next_page_task['save'].update(
                {'cursor': next_page_task['save']['cursor'] + 20})
            new_tasks.append(next_page_task)

    return None, new_tasks

def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)

    default_headers = deepcopy(DEFAULT_HEADERS)
    default_headers.update({'Referer': 'http://www.guokr.com/scientific/'})

    save_path = SCRIPT_CONFIG['SAVE_PATH']
    book_name = '果壳网'

    task = Task.make_task({
        'url': API_URL.format(start),
        'method': 'GET',
        'meta': {
            'headers': default_headers,
            'verify': False
        },
        'parser': parser_list,
        'priority': 0,
        'save': {
            'cursor': start,
            'start': start,
            'end': end,
            'kw': kw,
            'save_path': SCRIPT_CONFIG['SAVE_PATH'],
        },
        'retry': 3,
    })
    iq.put(task)

    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        pass

    crawler.start()

    items = []
    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()

    with HTML2Kindle(items, save_path, book_name,
                     MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
        html2kindle.make_metadata(window=kw.get('window', 50))
        html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(SCRIPT_CONFIG['SAVE_PATH'])

    os._exit(0)

def test_normal(self):
    def parser_mock(task):
        self.assertEqual(task['response'].text, "Hello World!")
        return None, None

    task = Task.make_task({
        'url': "http://127.0.0.1:5000/hello_world",
        'method': 'GET',
        'parser': parser_mock,
        'priority': 0,
    })
    self.iq.put(task)
    self.crawler.start()

def parser_content(task):
    title = task['title']
    items = []
    new_tasks = []
    response = task['response']

    if not response:
        raise RetryDownload

    response.encoding = 'utf-8'
    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.article-detail-bd > .detail')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("不能找到文章的内容。(如一直出现,而且浏览器能正常访问,可能是代码升级,请通知开发者。)", 'WARN')
        raise RetryDownload

    author_name = '未知'
    voteup_count = task['voteup_count']
    created_time = task['created_time']
    article_url = task['url']
    article_id = md5string(article_url)

    download_img_list, content = format_content(content, task)

    items.append([
        article_id, title, content, created_time, voteup_count, author_name,
        int(time.time() * 100000)
    ])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(Task.make_task({
                'url': img_url,
                'method': 'GET',
                'meta': {'headers': img_header, 'verify': False},
                'parser': parser_downloader_img,
                'resulter': resulter_downloader_img,
                'save': task['save'],
                'priority': 10,
            }))

    task.update({'parsed_data': items})
    return task, new_tasks

def parser_content(task):
    response = task['response']
    if not response:
        LOG.log_it("Not Response", 'WARN')
        raise RetryDownload

    new_tasks = []
    items = []

    content = response.text
    # Strip empty paragraph and <br/> tags
    content = content.replace('</p><p>', '').replace('<br/>', '')

    soup = BeautifulSoup(content, 'lxml')

    title = task['save']['title']
    article_url = task['url']
    created_time = soup.select('.content-th-info span')[0].string[3:]
    author = soup.select('.content-th-info a')[0].string

    download_img_list, content = format_content(soup, task)

    items.append([
        md5string(article_url), title, content, created_time, '', author,
        int(time.time() * 100000)
    ])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'priority': 2,
                    'save': task['save']
                }))

    task.update({'parsed_data': items})
    return task, new_tasks

def mock_downloader_to_push_in_delayqueue(self):
    def parser_mock():
        return None, None

    tasks = [
        Task.make_task({
            'url': 'http://www.baidu.com?a={}'.format(random.randint(1, 999)),
            'method': 'GET',
            'retry_delay': random.random() * SEC,
            'parser': parser_mock,
        }) for _ in range(20)
    ]

    for task in tasks:
        # time.sleep(random.random())
        task['to_download_timestamp'] = time.time() + task['retry_delay']
        TaskManager.push_delay_queue(task)

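# A delayed task becomes eligible for download once the wall clock passes the
# to_download_timestamp stamped on it above. The consuming side of TaskManager
# is not shown in this section, so the helper below is only an illustrative
# sketch of that check, not the project's actual API.
def is_task_due(task, now=None):
    # Hypothetical helper: has this delayed task's retry time arrived?
    return (now if now is not None else time.time()) >= task['to_download_timestamp']
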
def test_retry_delay(self):
    delay = 1
    t = time.time() - delay

    def parser_mock(task):
        nonlocal t
        self.assertTrue(time.time() - t >= delay)
        t = time.time()
        raise RetryDownload
        return None, None

    task = Task.make_task({
        'url': "http://127.0.0.1:5000/retry_delay",
        'method': 'GET',
        'parser': parser_mock,
        'priority': 0,
        'retry_delay': delay,
        'retry': 3
    })
    self.iq.put(task)
    self.crawler.start()

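# The two tests above expect a local server on 127.0.0.1:5000 that exposes
# /hello_world (returning "Hello World!") and /retry_delay. The project's real
# test fixture is not shown here; the snippet below is a minimal Flask sketch
# of such a server, for reference only.
from flask import Flask

app = Flask(__name__)


@app.route('/hello_world')
def hello_world():
    return "Hello World!"


@app.route('/retry_delay')
def retry_delay():
    return "retry me"


if __name__ == '__main__':
    app.run(port=5000)
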
def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))

    default_headers = deepcopy(DEFAULT_HEADERS)
    default_headers.update({'Referer': 'http://www.guokr.com/scientific/'})

    save_path = SCRIPT_CONFIG['SAVE_PATH']
    book_name = '果壳网'

    task = Task.make_task({
        'url': API_URL.format(start),
        'method': 'GET',
        'meta': {
            'headers': default_headers,
            'verify': False
        },
        'parser': parser_list,
        'priority': 0,
        'save': {
            'cursor': start,
            'start': start,
            'end': end,
            'kw': kw,
            'save_path': SCRIPT_CONFIG['SAVE_PATH'],
        },
        'retry': 10,
        'retry_delay': 10
    })
    iq.put(task)

    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        _ = db.select_all_article_id()
        if _:
            for each in _:
                ARTICLE_ID_SET.add(each[0])

    crawler.start()

    items = []
    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()
        db.reset()

    if items:
        new = True
        with HTML2Kindle(items, save_path, book_name,
                         MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)
    else:
        LOG.log_it('无新项目', 'INFO')
        new = False

    if new and kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(SCRIPT_CONFIG['SAVE_PATH'])

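# Illustrative invocation only (not part of the original module): start/end are
# Guokr API offsets that parser_list advances in steps of 20, and the kw keys
# (img, gif, email, window) are the options read elsewhere in this script.
if __name__ == '__main__':
    main(0, 100, {'img': True, 'gif': True, 'email': False, 'window': 50})
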
def parser_content(task):
    response = task['response']
    if not response:
        LOG.log_it("Not Response", 'WARN')
        raise RetryDownload

    new_tasks = []
    download_img_list = []
    items = []

    soup = BeautifulSoup(response.text, 'lxml')
    content_select = soup.select('.document')

    # Remove the useless copyright block at the end of each page
    if content_select:
        for to_del in soup.select('.copyright'):
            to_del.decompose()
    content = str(content_select)

    # bs4 automatically wraps the fragment with <html> and <body> tags
    content = re.sub('<html><body>(.*?)</body></html>',
                     lambda x: x.group(1),
                     content,
                     flags=re.S)

    download_img_list.extend(re.findall('src="(http.*?)"', content))

    # Rewrite image srcs to local relative paths
    content = re.sub('src="(.*?)"', convert_link, content)
    # Strip the surrounding "[]" that str() of the select() result list adds
    content = content[1:-1]

    title = task['save']['title']
    article_url = task['url']
    created_time = soup.select('.content-th-info span')[0].string[3:]
    author = soup.select('.content-th-info a')[0].string

    bs2 = BeautifulSoup(content, 'lxml')
    # Center images
    for tab in bs2.select('img'):
        tab.wrap(bs2.new_tag('div', style='text-align:center;'))
        tab['style'] = "display: inline-block;"

        # Drop GIFs if requested
        if task['save']['kw']['gif'] is False:
            if 'gif' in tab['src']:
                tab.decompose()
                continue
    content = str(bs2)

    items.append([
        md5string(article_url), title, content, created_time, '', author,
        int(time.time() * 100000)
    ])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'priority': 2,
                    'save': task['save']
                }))

    task.update({'parsed_data': items})
    return task, new_tasks

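# convert_link is used above (and in the Zhihu/Qdaily parsers below) as an
# re.sub callback that swaps a remote image src for the local file the image
# resulter is expected to write. Its real naming scheme is not shown in this
# section; the sketch below assumes the local name is the md5 of the URL plus
# its original extension, saved under a ./static/ directory.
import os
from urllib.parse import urlparse


def convert_link(match):
    url = match.group(1)
    ext = os.path.splitext(urlparse(url).path)[1] or '.jpg'  # assumed fallback
    return 'src="./static/{}{}"'.format(md5string(url), ext)  # assumed layout
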
def main(zhuanti_list, start, end, kw):
    """start defaults to 1; end is the last page number (9 articles per page)."""
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    start = int(start)
    end = int(end)

    for zhuanti in zhuanti_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update({'Referer': BASE_URL.format(zhuanti)})

        # Use the collection's numeric id as the sub-directory name
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanti))

        if kw.get('order_by') == 'comment':
            order_by = ORDER_COMMENT
        elif kw.get('order_by') == 'add':
            order_by = ORDER_ADD
        elif kw.get('order_by') == 'top':
            order_by = ORDER_TOP
        else:
            # Default to ORDER_ADD
            order_by = ORDER_ADD

        task = Task.make_task({
            'url': API_URL.format(zhuanti, order_by, start),
            'method': 'GET',
            'meta': {'headers': new_header, 'verify': False},
            'parser': parser_list,
            'priority': 0,
            'save': {'cursor': start,
                     'save_path': save_path,
                     'start': start,
                     'end': end,
                     'kw': kw,
                     'name': zhuanti,
                     'order_by': order_by},
            'retry': 10,
            'retry_delay': 10
        })
        iq.put(task)

        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            _ = db.select_all_article_id()
            # Use a set for deduplication
            if _:
                for each in _:
                    ARTICLE_ID_SET.add(each[0])

    # Start crawling
    crawler.start()

    # Build the ebooks
    for zhuanti in zhuanti_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'], str(zhuanti))

        with ArticleDB(save_path, VERSION=0) as db:
            # Read all articles
            items.extend(db.select_article())
            # Get the collection name from the database
            book_name = db.select_meta('BOOK_NAME')
            # Bump the database version
            db.increase_version()
            # Finalize the database
            db.reset()

        if items:
            with HTML2Kindle(items, save_path, book_name,
                             MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)

            if kw.get('email'):
                with SendEmail2Kindle() as s:
                    s.send_all_mobi(save_path)
        else:
            LOG.log_it('无新项目', 'INFO')

def parser_collection(task):
    to_next = True
    response = task['response']
    if not response:
        raise RetryDownload

    text = response.text
    bs = BeautifulSoup(text, 'lxml')
    download_img_list = []
    new_tasks = []
    items = []

    # Get the current page number
    now_page_num = int(re.search(r'page=(\d*)$', response.url).group(1)) if re.search(
        r'page=(\d*)$', response.url) else 1

    if not bs.select('.zm-item'):
        LOG.log_it("无法获取收藏列表(如一直出现,而且浏览器能正常访问知乎,可能是知乎代码升级,请通知开发者。)", 'WARN')
        raise RetryDownload

    collection_name = bs.select(
        '.zm-item-title')[0].string.strip() + ' 第{}页'.format(now_page_num)
    LOG.log_it("获取收藏夹[{}]".format(collection_name), 'INFO')

    book_name = bs.select('#zh-fav-head-title')[0].string.strip() if bs.select(
        '#zh-fav-head-title') else task['save']['name']

    for i in bs.select('.zm-item'):
        article_url = i.select('.zm-item-fav a.toggle-expand')[0].attrs['href']
        article_id = md5string(article_url)

        # Skip items that are already in the database and stop paging further
        if article_id not in ARTICLE_ID_SET:
            author_name = i.select(
                '.answer-head a.author-link')[0].string if i.select(
                    '.answer-head a.author-link') else '匿名'
            title = i.select('.zm-item-title a')[0].string if i.select(
                '.zm-item-title a') else ''
            voteup_count = i.select(
                'a.zm-item-vote-count')[0].string if i.select(
                    'a.zm-item-vote-count') else ''
            created_time = i.select('p.visible-expanded a')[0].string.replace(
                '发布于 ', '') if i.select('p.visible-expanded a') else ''
            content = i.select('.content')[0].string if i.select(
                '.content') else ''

            _, content = format_zhihu_content(content, task)
            download_img_list.extend(_)

            items.append([
                article_id, title, content, created_time, voteup_count,
                author_name,
                int(time.time() * 100000)
            ])
        else:
            to_next = False

    # Fetch the next page
    if to_next and now_page_num < task['save']['end']:
        next_page = bs.select('.zm-invite-pager span a')
        # Only if there actually is a next page
        if next_page and next_page[-1].string == '下一页':
            next_page = re.sub(r'\?page=\d+', '',
                               task['url']) + next_page[-1]['href']
            new_tasks.append(
                Task.make_task({
                    'url': next_page,
                    'method': 'GET',
                    'priority': 0,
                    'save': task['save'],
                    'meta': task['meta'],
                    'parser': parser_collection,
                    'resulter': resulter_collection,
                }))

    # Fetch images
    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'priority': 5,
                    'save': task['save']
                }))

    if items:
        task.update({'parsed_data': items})
        task['save'].update({'book_name': book_name})
        return task, new_tasks
    else:
        return None, new_tasks

def main(zhuanlan_name_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)

    for zhuanlan_name in zhuanlan_name_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update(
            {'Referer': 'https://zhuanlan.zhihu.com/{}'.format(zhuanlan_name)})

        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))

        task = Task.make_task({
            'url':
            'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=20&offset={}'
            .format(zhuanlan_name, start),
            'method': 'GET',
            'meta': {
                'headers': new_header,
                'verify': False
            },
            'parser': parser_list,
            'priority': 0,
            'save': {
                'cursor': start,
                'save_path': save_path,
                'start': start,
                'end': end,
                'kw': kw,
                'name': zhuanlan_name
            },
            'retry': 3,
        })
        iq.put(task)

        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            pass

    crawler.start()

    for zhuanlan_name in zhuanlan_name_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))
        with ArticleDB(save_path, VERSION=0) as db:
            db.insert_meta_data(['BOOK_NAME', zhuanlan_name])
            items.extend(db.select_article())
            db.increase_version()

        with HTML2Kindle(items, save_path, zhuanlan_name,
                         MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        for zhuanlan_name in zhuanlan_name_list:
            with SendEmail2Kindle() as s:
                s.send_all_mobi(
                    os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name)))

    os._exit(0)

def parser_list(task):
    response = task['response']
    new_tasks = []
    to_next = True

    if not response:
        raise RetryDownload

    try:
        text = response.text
        bs = BeautifulSoup(text, 'lxml')
    except Exception as e:
        LOG.log_it('解析网页出错(如一直出现,而且浏览器能正常访问,可能是网站代码升级,请通知开发者。)ERRINFO:{}'
                   .format(str(e)), 'WARN')
        raise RetryDownload

    book_name = bs.title.string if bs.title else task['save']['name']

    # Insert the book name into the database
    with ArticleDB(task['save']['save_path']) as article_db:
        article_db.insert_meta_data(
            ['BOOK_NAME', format_file_name('简书专题_' + book_name)],
            update=False)

    # Reverse the order
    items = bs.select('a.title')
    items.reverse()

    for item in items:
        # Skip articles that are already in the database
        url = 'https://www.jianshu.com' + item.attrs['href']
        if md5string(url) in ARTICLE_ID_SET:
            to_next = False
            continue

        try:
            title = item.string
        except:
            LOG.log_it('解析标题出错(如一直出现,而且浏览器能正常访问,可能是网站代码升级,请通知开发者。)', 'WARN')
            raise RetryDownload

        new_task = Task.make_task({
            'url': url,
            'method': 'GET',
            'meta': task['meta'],
            'parser': parser_content,
            'resulter': resulter_content,
            'priority': 5,
            'save': task['save'],
            'title': title,
        })
        new_tasks.append(new_task)

    # Next page
    if to_next and len(items) != 0:
        if task['save']['cursor'] < task['save']['end']:
            next_page_task = deepcopy(task)
            next_page_task.update({
                'url': API_URL.format(task['save']['name'],
                                      task['save']['order_by'],
                                      task['save']['cursor'] + 1)
            })
            next_page_task['save'].update(
                {'cursor': next_page_task['save']['cursor'] + 1})
            new_tasks.append(next_page_task)

    return None, new_tasks

def main(collection_num_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    new = True

    for collection_num in collection_num_list:
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(collection_num))
        task = Task.make_task({
            'url': 'https://www.zhihu.com/collection/{}?page={}'.format(
                collection_num, start),
            'method': 'GET',
            'meta': {
                'headers': DEFAULT_HEADERS,
                'verify': False
            },
            'parser': parser_collection,
            'resulter': resulter_collection,
            'priority': 0,
            'retry': 10,
            'save': {
                'start': start,
                'end': end,
                'kw': kw,
                'save_path': save_path,
                'name': collection_num,
            },
            'retry_delay': 10
        })
        iq.put(task)

        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            _ = db.select_all_article_id()
            if _:
                for each in _:
                    ARTICLE_ID_SET.add(each[0])

    crawler.start()

    for collection_num in collection_num_list:
        items = []
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(collection_num))
        with ArticleDB(save_path) as db:
            items.extend(db.select_article())
            book_name = db.select_meta('BOOK_NAME')
            db.increase_version()
            db.reset()

        if items:
            new = True
            with HTML2Kindle(items, save_path, book_name,
                             MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)
        else:
            LOG.log_it('无新项目', 'INFO')
            new = False

    if new and kw.get('email'):
        for collection_num in collection_num_list:
            save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                     str(collection_num))
            with SendEmail2Kindle() as s:
                s.send_all_mobi(save_path)

def parser_content(task):
    title = task['title']
    download_img_list = []
    new_tasks = []
    response = task['response']

    if not response:
        raise RetryDownload

    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.PostIndex-content')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("不能找到文章的内容。(如一直出现,而且浏览器能正常访问知乎,可能是知乎代码升级,请通知开发者。)", 'WARN')
        raise RetryDownload

    author_name = bs.select('.PostIndex-authorName')[0].string if bs.select(
        '.PostIndex-authorName') else ''
    voteup_count = re.search(
        r'likesCount":(\d+),', response.text).group(1) if re.search(
            r'likesCount":(\d+),', response.text) else ''
    created_time = str(
        bs.select('.PostIndex-header .HoverTitle')[1]['data-hover-title']
    ) if len(bs.select('.PostIndex-header .HoverTitle')) == 2 else ''
    article_url = task['url']

    bs = BeautifulSoup(content, 'lxml')

    # Remove useless img tags (inline data URIs)
    for tab in bs.select('img[src^="data"]'):
        tab.decompose()

    # Center images
    for tab in bs.select('img'):
        if 'equation' not in tab['src']:
            tab.wrap(bs.new_tag('div', style='text-align:center;'))
            tab['style'] = "display: inline-block;"

        # Drop GIFs if requested
        if task['save']['kw']['gif'] is False:
            if 'gif' in tab['src']:
                tab.decompose()
                continue

    content = str(bs)
    # bs4 automatically wraps the fragment with <html> and <body> tags
    content = re.sub('<html><body>(.*?)</body></html>',
                     lambda x: x.group(1),
                     content,
                     flags=re.S)

    # Formula URL conversion (Zhihu changed the address again)
    # content = content.replace('//www.zhihu.com', 'http://www.zhihu.com')

    download_img_list.extend(re.findall('src="(http.*?)"', content))

    # Rewrite image srcs to local relative paths
    content = re.sub('src="(.*?)"', convert_link, content)
    # Convert redirect hyperlinks back to their real targets
    content = re.sub(r'//link.zhihu.com/\?target=(.*?)"',
                     lambda x: unquote(x.group(1)), content)
    content = re.sub('<noscript>(.*?)</noscript>',
                     lambda x: x.group(1),
                     content,
                     flags=re.S)

    item = [
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ]

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(DEFAULT_HEADERS)
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({"parsed_data": item})
    return task, new_tasks

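# Several parsers above delegate the cleanup that this parser_content inlines
# to a format_zhihu_content(content, task) helper. The sketch below is an
# assumed factoring of that inline logic (same steps, same return shape of
# (download_img_list, content)); the project's actual helper may differ.
def format_zhihu_content(content, task):
    download_img_list = []
    bs = BeautifulSoup(content, 'lxml')

    # Remove inline data-URI images
    for tab in bs.select('img[src^="data"]'):
        tab.decompose()

    # Center images, optionally dropping GIFs
    for tab in bs.select('img'):
        if 'equation' not in tab['src']:
            tab.wrap(bs.new_tag('div', style='text-align:center;'))
            tab['style'] = "display: inline-block;"
        if task['save']['kw'].get('gif') is False and 'gif' in tab['src']:
            tab.decompose()
            continue

    content = str(bs)
    # Strip the <html><body> wrapper that bs4 adds
    content = re.sub('<html><body>(.*?)</body></html>',
                     lambda x: x.group(1), content, flags=re.S)

    download_img_list.extend(re.findall('src="(http.*?)"', content))
    # Rewrite srcs to local paths, unwrap Zhihu redirect links and noscript tags
    content = re.sub('src="(.*?)"', convert_link, content)
    content = re.sub(r'//link.zhihu.com/\?target=(.*?)"',
                     lambda x: unquote(x.group(1)), content)
    content = re.sub('<noscript>(.*?)</noscript>',
                     lambda x: x.group(1), content, flags=re.S)

    return download_img_list, content
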
def main(start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))

    new_header = deepcopy(DEFAULT_HEADERS)

    global IS_TODAY_URL
    if start is None:
        IS_TODAY_URL = True
        save_path = os.path.join(
            SCRIPT_CONFIG['SAVE_PATH'],
            'zhihu_daily_' + get_datetime_string('%Y%m%d'))
        book_name = '知乎日报_' + get_datetime_string('%Y%m%d')
    else:
        if end is None:
            end = datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y%m%d')
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 'zhihu_daily_{}_{}'.format(start, end))
        book_name = '知乎日报_{}_{}'.format(start, end)
        IS_TODAY_URL = False

    url = TODAY_URL if IS_TODAY_URL else YESTERDAY_URL.format(start)

    task = Task.make_task({
        'url': url,
        'method': 'GET',
        'meta': {
            'headers': new_header,
            'verify': False
        },
        'parser': parser_list,
        'priority': 0,
        'save': {
            'cursor': start,
            'save_path': save_path,
            'start': start,
            'end': end,
            'kw': kw
        },
        'retry': 99,
        'retry_delay': 10
    })
    iq.put(task)

    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        _ = db.select_all_article_id()
        if _:
            for each in _:
                ARTICLE_ID_SET.add(each[0])

    crawler.start()

    items = []
    with ArticleDB(save_path, VERSION=0) as db:
        db.insert_meta_data(['BOOK_NAME', book_name])
        items.extend(db.select_article())
        db.increase_version()
        db.reset()

    if items:
        new = True
        with HTML2Kindle(items, save_path, book_name,
                         MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
            html2kindle.make_metadata(window=kw.get('window', 50))
            html2kindle.make_book_multi(save_path)
    else:
        LOG.log_it('无新项目', 'INFO')
        new = False

    if new and kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(os.path.join(save_path))

def main(start, end, kw):
    # start: e.g. '2017-12-11'
    # end: e.g. '2017-12-12'
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q)

    try:
        start_l = [int(_) for _ in start.split('-')]
        end_l = [int(_) for _ in end.split('-')]
        start_t = int(
            datetime.datetime(start_l[0], start_l[1],
                              start_l[2]).timestamp()) + 60 * 60 * 24
        end_t = int(
            datetime.datetime(end_l[0], end_l[1], end_l[2]).timestamp())
    except:
        LOG.log_it('日期格式错误', 'WARN')
        traceback.print_exc()
        return

    global API_URL
    if 'type' in kw:
        if kw['type'] == 'business':
            API_URL = API_BUSINESS
        elif kw['type'] == 'intelligent':
            API_URL = API_INTELLIGENT
        elif kw['type'] == 'design':
            API_URL = API_DESIGN
        elif kw['type'] == 'fashion':
            API_URL = API_FASHION
        elif kw['type'] == 'entertainment':
            API_URL = API_ENTERTAINMENT
        elif kw['type'] == 'city':
            API_URL = API_CITY
        elif kw['type'] == 'game':
            API_URL = API_GAME
        elif kw['type'] == 'long':
            API_URL = API_LONG
        elif kw['type'] == 'home':
            pass
    else:
        kw.update({'type': 'home'})

    new_header = deepcopy(SCRIPT_CONFIG.get('DEFAULT_HEADERS'))
    new_header.update({'Referer': 'https://www.qdaily.com/'})

    save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                             'qdaily_{}'.format(kw['type']))
    book_name = '好奇心日报_{}_{}_{}'.format(kw['type'], start, end)

    task = Task.make_task({
        'url': API_URL.format(start_t),
        'method': 'GET',
        'meta': {
            'headers': new_header,
            'verify': False
        },
        'parser': parser_list,
        'priority': 0,
        'save': {
            'cursor': start_t,
            'save_path': save_path,
            'start': start_t,
            'end': end_t,
            'kw': kw,
            'page': 1,
            'name': book_name,
        },
        'retry': 3,
    })
    iq.put(task)

    # Init DB
    with ArticleDB(save_path, VERSION=0) as db:
        pass

    crawler.start()

    items = []
    with ArticleDB(save_path) as db:
        items.extend(db.select_article())
        db.insert_meta_data(['BOOK_NAME', book_name])
        db.increase_version()

    with HTML2Kindle(items, save_path, book_name,
                     MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
        html2kindle.make_metadata(window=kw.get('window', 50))
        html2kindle.make_book_multi(save_path)

    if kw.get('email'):
        with SendEmail2Kindle() as s:
            s.send_all_mobi(save_path)

    os._exit(0)

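# The if/elif chain above maps kw['type'] to an API constant. For reference, an
# equivalent table-driven lookup is sketched below; it assumes the same API_*
# constants as the original and the same 'home' fallback when no type is given.
# The helper name is hypothetical, not part of the original module.
API_BY_TYPE = {
    'business': API_BUSINESS,
    'intelligent': API_INTELLIGENT,
    'design': API_DESIGN,
    'fashion': API_FASHION,
    'entertainment': API_ENTERTAINMENT,
    'city': API_CITY,
    'game': API_GAME,
    'long': API_LONG,
}


def select_api_url(kw, default_api_url):
    # Hypothetical helper: default missing types to 'home' and fall back to the
    # module's default API URL for unknown types.
    kw.setdefault('type', 'home')
    return API_BY_TYPE.get(kw['type'], default_api_url)
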
def parser_content(task):
    title = task['title']
    items = []
    download_img_list = []
    new_tasks = []
    response = task['response']

    if not response:
        raise RetryDownload

    response.encoding = 'utf-8'
    bs = BeautifulSoup(response.text, 'lxml')

    content_tab = bs.select('.article-detail-bd > .detail')
    if content_tab:
        content = str(content_tab[0])
    else:
        LOG.log_it("不能找到文章的内容。(如一直出现,而且浏览器能正常访问,可能是代码升级,请通知开发者。)", 'WARN')
        raise RetryDownload

    author_name = '未知'
    voteup_count = task['voteup_count']
    created_time = task['created_time']
    article_url = task['url']

    bs = BeautifulSoup(content, 'lxml')

    # Center images
    for tab in bs.select('img'):
        if len(tab.attrs['class']) != 1:
            tab.decompose()
            continue

        # Drop GIFs if requested
        if task['save']['kw']['gif'] is False:
            if 'gif' in tab['data-src']:
                tab.decompose()
                continue

        tab.wrap(bs.new_tag('div', style='text-align:center;'))
        tab['style'] = "display: inline-block;"

    content = str(bs)
    # bs4 automatically wraps the fragment with <html> and <body> tags
    content = re.sub('<html><body>(.*?)</body></html>',
                     lambda x: x.group(1),
                     content,
                     flags=re.S)

    download_img_list.extend(re.findall('src="(http.*?)"', content))

    # Rewrite image srcs to local relative paths
    content = re.sub('src="(.*?)"', convert_link, content)
    content = content.replace('data-src', 'src')

    items.append([
        md5string(article_url), title, content, created_time, voteup_count,
        author_name,
        int(time.time() * 100000)
    ])

    if task['save']['kw'].get('img', True):
        img_header = deepcopy(SCRIPT_CONFIG.get('DEFAULT_HEADERS'))
        img_header.update({'Referer': response.url})
        for img_url in download_img_list:
            new_tasks.append(
                Task.make_task({
                    'url': img_url,
                    'method': 'GET',
                    'meta': {
                        'headers': img_header,
                        'verify': False
                    },
                    'parser': parser_downloader_img,
                    'resulter': resulter_downloader_img,
                    'save': task['save'],
                    'priority': 10,
                }))

    task.update({'parsed_data': items})
    return task, new_tasks

def parser_list(task):
    response = task['response']
    new_tasks = []
    opf = []

    if not response:
        raise RetryDownload

    try:
        data = response.json()
    except Exception as e:
        LOG.log_it(
            '解析JSON出错(如一直出现,而且浏览器能正常访问,可能是代码升级,请通知开发者。)ERRINFO:{}'.format(
                str(e)), 'WARN')
        raise RetryDownload

    try:
        # Next page
        if len(data['data']) != 0:
            if data['data']['last_key'] > task['save']['end'] - 144209:
                next_page_task = deepcopy(task)
                next_page_task.update(
                    {'url': API_URL.format(data['data']['last_key'])})
                next_page_task['save'].update({
                    'cursor': data['data']['last_key'],
                    'page': task['save']['page'] + 1
                })
                new_tasks.append(next_page_task)
        else:
            LOG.log_it('不能读取专栏列表。(如一直出现,而且浏览器能正常访问,可能是代码升级,请通知开发者。)', 'WARN')
            raise RetryDownload

        for item in data['data']['feeds']:
            if item['datatype'] == 'article':
                item = item['post']

                # Titles that are too long break mobi generation, so truncate them
                title = item['title']
                if len(title) > 55:
                    _ = 55 - len(title) - 3
                    title = title[:_] + '...'
                opf.append({'href': format_file_name(title, '.html')})

                new_task = Task.make_task({
                    'url':
                    'https://www.qdaily.com/articles/{}.html'.format(
                        str(item['id'])),
                    'method': 'GET',
                    'meta': task['meta'],
                    'parser': parser_content,
                    'resulter': resulter_content,
                    'priority': 5,
                    'save': task['save'],
                    'title': item['title'],
                    'created_time': item['publish_time'],
                    'voteup_count': item['praise_count']
                })
                new_tasks.append(new_task)
    except KeyError:
        LOG.log_it('JSON KEY出错(如一直出现,而且浏览器能正常访问,可能是网站代码升级,请通知开发者。)', 'WARN')
        raise RetryDownload

    return None, new_tasks

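# format_file_name is called above (and in the Jianshu list parser) to turn an
# article title into a safe file name, optionally appending an extension. Its
# real rules are not shown in this section; the sketch below only assumes it
# replaces characters that are illegal in file names.
def format_file_name(name, ext=''):
    # Assumed: replace path separators and other forbidden characters with '_'.
    return re.sub(r'[\\/:*?"<>|\r\n]+', '_', name).strip() + ext
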
def main(zhuanlan_name_list, start, end, kw):
    iq = PriorityQueue()
    oq = PriorityQueue()
    result_q = Queue()
    crawler = Crawler(iq, oq, result_q,
                      MAIN_CONFIG.get('PARSER_WORKER', 1),
                      MAIN_CONFIG.get('DOWNLOADER_WORKER', 1),
                      MAIN_CONFIG.get('RESULTER_WORKER', 1))
    new = True

    for zhuanlan_name in zhuanlan_name_list:
        new_header = deepcopy(DEFAULT_HEADERS)
        new_header.update(
            {'Referer': 'https://zhuanlan.zhihu.com/{}'.format(zhuanlan_name)})

        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))

        task = Task.make_task({
            'url':
            'https://zhuanlan.zhihu.com/api/columns/{}/posts?limit=20&offset={}'
            .format(zhuanlan_name, start),
            'method': 'GET',
            'meta': {
                'headers': new_header,
                'verify': False
            },
            'parser': parser_list,
            'priority': 0,
            'save': {
                'cursor': start,
                'save_path': save_path,
                'start': start,
                'end': end,
                'kw': kw,
                'name': zhuanlan_name
            },
            'retry': 10,
            'retry_delay': 10
        })
        iq.put(task)

        # Init DB
        with ArticleDB(save_path, VERSION=0) as db:
            _ = db.select_all_article_id()
            if _:
                for each in _:
                    ARTICLE_ID_SET.add(each[0])

    crawler.start()

    for zhuanlan_name in zhuanlan_name_list:
        items = []
        book_name = '知乎专栏_{}'.format(zhuanlan_name)
        save_path = os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name))

        with ArticleDB(save_path, VERSION=0) as db:
            db.insert_meta_data(['BOOK_NAME', zhuanlan_name])
            items.extend(db.select_article())
            db.increase_version()
            db.reset()

        if items:
            new = True
            with HTML2Kindle(items, save_path, book_name,
                             MAIN_CONFIG.get('KINDLEGEN_PATH')) as html2kindle:
                html2kindle.make_metadata(window=kw.get('window', 50))
                html2kindle.make_book_multi(save_path)
        else:
            LOG.log_it('无新项目', 'INFO')
            new = False

    if new and kw.get('email'):
        for zhuanlan_name in zhuanlan_name_list:
            with SendEmail2Kindle() as s:
                s.send_all_mobi(
                    os.path.join(SCRIPT_CONFIG['SAVE_PATH'],
                                 str(zhuanlan_name)))