def parse_list_page(job):
    response = HtmlResponse(url=job.url, body=job.rsp, encoding='utf8')
    links = []
    if job.action == 11:  # ershicimi
        links = response.selector.xpath("//*[@class='weui_media_title']/a/@href").extract()
    elif job.action == 12:  # wemp
        links = response.selector.xpath("//*[@class='post-item__main']//a[@class='post-item__title']/@href").extract()
    elif job.action == 13:  # chansongme
        links = response.selector.xpath("//*[@class='feed_item_question']//a[@class='question_link']/@href").extract()
    elif job.action == 14:
        links = response.selector.xpath("//div[@class='grid news_desc']/h3/a/@href").extract()

    if not links:
        logger.warning(f"页面解析失败:`{job.url}")
        return 4

    for link in links:
        link = urllib.parse.urljoin(job.url, link)
        if is_crawled_url(link):
            continue
        new_job = Job(site=job.site, status=0, url=link, action=job.action + 10)
        new_job.save()
    return 2
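# parse_list_page fans each matched list link out into a detail-page Job whose
# action code is the list action plus 10 (e.g. 11 -> 21). The Job model itself
# is not shown in this section; the sketch below is a hypothetical
# reconstruction, with fields inferred only from the Job(...) calls above.
from django.db import models

class Job(models.Model):
    site = models.ForeignKey('Site', on_delete=models.CASCADE)
    url = models.CharField(max_length=1024)
    # 11/12/13/14 are list-page actions; detail pages use the list action + 10
    action = models.IntegerField()
    # 0 = pending; other values are assumed to track progress/result codes
    status = models.IntegerField(default=0)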
def atom_spider(site):
    """
    Update the feed's content
    """
    try:
        resp = requests.get(site.rss, timeout=30, verify=False)
    except:
        if site.star >= 9:
            logger.warning(f"RSS源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries[:10]:
        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = None

        try:
            value = entry.content[0].value
        except:
            value = entry.get('description') or entry.link

        # convert relative image URLs to absolute ones
        try:
            content_soup = BeautifulSoup(value, "html.parser")
            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src
            value = str(content_soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        try:
            article = Article(site=site, title=title, author=author, src_url=link, uindex=current_ts(),
                              content=value)
            article.save()
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')
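# is_crawled_url / mark_crawled_url are the dedup gate used by every spider in
# this section but are not defined here. A minimal sketch follows; backing the
# check with a Redis set (and the key name) is purely an assumption -- only the
# two signatures come from the calling code above.
import hashlib
import redis

redis_client = redis.Redis()

def _url_fingerprint(url: str) -> str:
    # hash the URL so keys stay short and uniform
    return hashlib.md5(url.encode('utf-8')).hexdigest()

def is_crawled_url(url: str) -> bool:
    return bool(redis_client.sismember('crawled:urls', _url_fingerprint(url)))

def mark_crawled_url(url: str) -> None:
    redis_client.sadd('crawled:urls', _url_fingerprint(url))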
def wemp_spider(urls, site):
    """
    Crawl WeChat official-account content
    :param urls:
    :param site:
    :return:
    """
    for url in urls:
        if is_crawled_url(url):
            continue
        try:
            logger.info(f'开始爬取公众号地址:`{url}')
            rsp = requests.get(url, timeout=10)

            if rsp.ok:
                response = HtmlResponse(url=url, body=rsp.text, encoding='utf8')
                title = response.selector.xpath('//h2[@id="activity-name"]/text()').extract_first().strip()
                content = response.selector.xpath('//div[@id="js_content"]').extract_first().strip()
                try:
                    author = response.selector.xpath('//span[@id="js_author_name"]/text()').extract_first().strip()
                except:
                    author = response.selector.xpath('//a[@id="js_name"]/text()').extract_first().strip()

                if title and content:
                    content_soup = BeautifulSoup(content, "html.parser")
                    for img in content_soup.find_all('img'):
                        if img.attrs.get('data-src'):
                            img.attrs['src'] = img.attrs['data-src']
                    article = Article(title=title, author=author, site=site, uindex=current_ts(),
                                      content=str(content_soup), src_url=url)
                    article.save()
                    mark_crawled_url(url)
                else:
                    logger.warning(f'公众号内容解析异常:`{title}`{author}`{content}')
        except (ConnectTimeout, HTTPError, ReadTimeout, Timeout, ConnectionError):
            logger.warning(f'公众号爬取出现网络异常:`{url}')
        except:
            logger.warning(f'公众号爬取出现未知异常:`{url}')
def wemp_spider(url, site):
    """
    Crawl WeChat official-account content
    :param url:
    :param site:
    :return:
    """
    if is_crawled_url(url):
        return

    try:
        rsp = requests.get(url, timeout=10)

        if rsp.ok:
            try:
                if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                    title, author, content = parse_weixin_page(rsp)
                elif 'ershicimi.com' in get_host_name(rsp.url):
                    title, author, content = parse_ershicimi_page(rsp)
                else:
                    logger.warning(f'公众号域名解析异常:`{rsp.url}')
                    return
            except:
                logger.info(f'公众号内容解析异常:`{rsp.url}')
                return

            article = Article(title=title, author=author, site=site, uindex=current_ts(), content=content,
                              src_url=url)
            article.save()
            mark_crawled_url(url)
    except (ConnectTimeout, HTTPError, ReadTimeout, Timeout, ConnectionError):
        logger.warning(f'公众号爬取出现网络异常:`{url}')
    except:
        logger.warning(f'公众号爬取出现未知异常:`{url}')
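# get_host_name only has to yield the host part of the final (possibly
# redirected) URL so it can be matched against 'mp.weixin.qq.com' or
# 'ershicimi.com'. The helper itself is not shown in the original, so the
# standard-library one-liner below is an assumed reconstruction.
import urllib.parse

def get_host_name(url: str) -> str:
    return urllib.parse.urlparse(url).netloc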
def make_mpwx_job(site, action):
    """
    Generate remote jobs from the feed source
    """
    if action == 10:
        # action 10: parse the RSS feed directly
        feed_obj = feedparser.parse(site.rss)
        for entry in feed_obj.entries:
            # some entries are empty
            if not entry:
                continue
            if not is_crawled_url(entry.link):
                # detail-page crawl, executed by distributed workers
                job = Job(site=site, action=action + 10, url=entry.link, status=0)
                job.save()
    else:
        # list-page crawl, executed by distributed workers
        job = Job(site=site, action=action, url=site.rss, status=0)
        job.save()
    return True
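# The action convention implied by make_mpwx_job and parse_list_page: 10 means
# "parse the RSS feed directly", 11-14 are list pages, and a detail-page job
# carries the list action plus 10. A hypothetical worker dispatch illustrating
# that split -- only parse_list_page and wemp_spider appear in the original,
# the run_job wrapper is an assumption.
def run_job(job):
    if 11 <= job.action <= 14:
        # list page: extract detail links and enqueue new detail-page jobs
        return parse_list_page(job)
    elif 21 <= job.action <= 24:
        # detail page: fetch and store the article itself
        return wemp_spider(job.url, job.site)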
def wemp_spider(url, site):
    """
    Crawl WeChat official-account content; supports both the native WeChat domain and the ershicimi domain
    :param url:
    :param site:
    :return:
    """
    if is_crawled_url(url):
        return

    rsp = get_with_proxy(url)
    if rsp is None:
        return

    if rsp.ok:
        try:
            if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                title, author, content = parse_weixin_page(rsp)
            elif 'ershicimi.com' in get_host_name(rsp.url):
                title, author, content = parse_ershicimi_page(rsp)
            else:
                logger.warning(f'公众号域名解析异常:`{rsp.url}')
                return
        except:
            logger.info(f'公众号内容解析异常:`{rsp.url}')
            return

        article = Article(title=title, author=author, site=site, uindex=current_ts(), content=content,
                          src_url=url)
        article.save()
        mark_crawled_url(url)
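# get_with_proxy is used wherever the target (mp.weixin.qq.com in particular)
# is likely to block direct requests. Only its contract -- "return a Response,
# or None on failure" -- is visible above; routing through a local proxy and
# the PROXIES address below are assumptions for illustration.
import requests

PROXIES = {'http': 'http://127.0.0.1:8118', 'https': 'http://127.0.0.1:8118'}

def get_with_proxy(url, timeout=15):
    try:
        return requests.get(url, timeout=timeout, proxies=PROXIES)
    except requests.RequestException:
        return None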
def update_all_user_feed():
    """
    Update all sites
    """
    logger.info('开始运行定时更新RSS任务')

    now = datetime.datetime.now()

    # update at different frequencies, on a 4-hour cycle
    if now.hour % 4 == 0:
        feeds = Site.objects.filter(status='active', creator='user').order_by('-star')
    elif now.hour % 4 == 1:
        feeds = []
    elif now.hour % 4 == 2:
        feeds = Site.objects.filter(status='active', creator='user', star__gte=20).order_by('-star')
    elif now.hour % 4 == 3:
        feeds = Site.objects.filter(status='active', creator='user', star__gte=9).order_by('-star')

    for site in feeds:
        try:
            resp = requests.get(site.rss, timeout=30, verify=False)
        except:
            if site.star >= 9:
                logger.warning(f"RSS源可能失效了`{site.rss}")
            else:
                logger.info(f"RSS源可能失效了`{site.rss}")
            continue

        content = BytesIO(resp.content)
        feed_obj = feedparser.parse(content)

        for entry in feed_obj.entries[:10]:
            try:
                title = entry.title
                link = entry.link
            except AttributeError:
                logger.warning(f'必要属性获取失败:`{site.rss}')
                continue

            if is_crawled_url(link):
                continue

            try:
                author = entry['author'][:11]
            except:
                author = None

            try:
                value = entry.content[0].value
            except:
                value = entry.get('description') or entry.link

            try:
                article = Article(site=site, title=title, author=author, src_url=link, uindex=current_ts(),
                                  content=value)
                article.save()
                mark_crawled_url(link)
            except django.db.utils.IntegrityError:
                logger.info(f'数据重复插入:`{title}`{link}')
            except:
                logger.warning(f'数据插入异常:`{title}`{link}')

    logger.info('定时更新RSS任务运行结束')
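# update_all_user_feed is written to be triggered once per hour; the
# now.hour % 4 branches then spread full, high-star and medium-star updates
# over a four-hour cycle. A hypothetical Celery beat entry that would drive
# it -- the task path 'web.tasks.update_all_user_feed' is an assumption.
from celery.schedules import crontab

CELERY_BEAT_SCHEDULE = {
    'update-all-user-feed': {
        'task': 'web.tasks.update_all_user_feed',
        'schedule': crontab(minute=0),  # top of every hour
    },
}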
def podcast_spider(site):
    """
    Update the feed's content
    """
    resp = get_with_retry(site.rss)
    if resp is None:
        logger.info(f"RSS 源可能失效了`{site.rss}")
        return None

    feed_obj = feedparser.parse(BytesIO(resp.content))

    for entry in feed_obj.entries:
        # some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
        except AttributeError:
            logger.warning(f'title 获取失败:`{site.rss}')
            continue

        link = entry.get('link') or entry.get('guid')
        if not link:
            logger.warning(f'link 获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = ''

        audio, img = None, ''
        if entry.get('links'):
            for el in entry['links']:
                if 'audio/' in el.get('type', '') or el.get('rel') == 'enclosure':
                    audio = el
                    break
        if entry.get('image'):
            img = entry.image.get('href')

        try:
            brief = entry.content[0].value
        except:
            brief = entry.get('description') or entry.link

        if audio is not None:
            # build the episode data required by the Podlove player
            episode = {
                "version": 5,
                "show": {
                    "title": site.cname,
                    "subtitle": site.brief,
                    "poster": site.favicon,
                    "link": site.link,
                },
                "title": title,
                "link": link,
                # "subtitle": brief,
                "publicationDate": entry.get('published'),
                "poster": img,
                "duration": to_podcast_duration(entry.get('itunes_duration', '')),
                "audio": [
                    {
                        "url": audio.href,
                        "mimeType": audio.type
                    }
                ]
            }
            episode = json.dumps(episode)
            episode = b64encode(bytes(episode, encoding='UTF8')).decode('UTF8')
            content = podcast_tmpl % episode + brief
        else:
            content = brief + f'''<p></p><img src="{img}">'''

        try:
            uindex = current_ts()
            article = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            article.save()
            write_dat2_file(uindex, site.id, content)
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
            mark_crawled_url(link)
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')
    return True
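# to_podcast_duration is referenced but not defined in this section. The
# Podlove JSON above needs a duration string, and itunes:duration can arrive
# as plain seconds, MM:SS or HH:MM:SS; the normalisation below is a plausible
# sketch of that helper, not the project's actual implementation.
def to_podcast_duration(raw: str) -> str:
    raw = (raw or '').strip()
    if not raw:
        return '00:00:00'
    try:
        parts = [int(float(p)) for p in raw.split(':')]
    except ValueError:
        return '00:00:00'
    if len(parts) == 1:       # plain seconds
        seconds = parts[0]
    elif len(parts) == 2:     # MM:SS
        seconds = parts[0] * 60 + parts[1]
    else:                     # HH:MM:SS
        seconds = parts[0] * 3600 + parts[1] * 60 + parts[2]
    return '%02d:%02d:%02d' % (seconds // 3600, seconds % 3600 // 60, seconds % 60)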
def atom_spider(site):
    """
    Update the feed's content
    """
    resp = get_with_retry(site.rss)
    if resp is None:
        if site.star > 9:
            logger.warning(f"RSS 源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries:
        # some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = ''

        try:
            value = entry.content[0].value
        except (AttributeError, IndexError):
            value = None
        if not value:
            value = entry.get('description') or entry.link

        # convert relative image URLs to absolute ones
        try:
            content_soup = BeautifulSoup(value, "html.parser")
            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src
            value = str(content_soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        try:
            uindex = current_ts()
            article = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            article.save()
            write_dat2_file(uindex, site.id, value)
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')

    set_updated_site(site.pk)
    return True
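# get_with_retry replaces the bare requests.get of the earlier atom_spider.
# Only its contract is visible above: return a Response on success, None once
# every attempt has failed. A minimal sketch under that assumption; the retry
# count and timeout are illustrative defaults.
import requests

def get_with_retry(url, retries=3, timeout=30):
    for _ in range(retries):
        try:
            resp = requests.get(url, timeout=timeout, verify=False)
            if resp.ok:
                return resp
        except requests.RequestException:
            continue
    return None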
def atom_spider(site):
    """
    Update the feed's content
    """
    resp = get_with_retry(site.rss)
    if resp is None:
        if site.star > 9:
            guard_log(f"RSS 源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries[:12]:
        # some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = ''

        try:
            value = entry.content[0].value
        except:
            value = entry.get('description') or entry.link

        # convert relative image URLs to absolute ones
        try:
            content_soup = BeautifulSoup(value, "html.parser")
            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src
            value = str(content_soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        # second-pass fetch for official-account RSS
        if get_host_name(site.rss) in ('qnmlgb.tech', ):
            if get_host_name(link) in ('mp.weixin.qq.com', ):
                rsp = get_with_proxy(link)
                if rsp is not None and rsp.ok:
                    try:
                        title, author, value = parse_weixin_page(rsp)
                    except:
                        pass

        try:
            article = Article(site=site, title=title, author=author, src_url=link, uindex=current_ts(),
                              content=value)
            article.save()
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')

    set_updated_site(site.pk)
    return True
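# parse_weixin_page is called by several spiders above but not shown. The
# sketch below reuses the selectors from the inline version of wemp_spider
# earlier in this section (activity-name, js_author_name / js_name,
# js_content), so the XPaths are grounded in the original; the function body
# itself is a hypothetical reconstruction returning (title, author, content).
from bs4 import BeautifulSoup
from scrapy.http import HtmlResponse

def parse_weixin_page(rsp):
    response = HtmlResponse(url=rsp.url, body=rsp.text, encoding='utf8')
    title = response.selector.xpath('//h2[@id="activity-name"]/text()').extract_first().strip()
    try:
        author = response.selector.xpath('//span[@id="js_author_name"]/text()').extract_first().strip()
    except AttributeError:
        author = response.selector.xpath('//a[@id="js_name"]/text()').extract_first().strip()
    content = response.selector.xpath('//div[@id="js_content"]').extract_first().strip()
    # WeChat lazy-loads images via data-src; promote it to src
    content_soup = BeautifulSoup(content, "html.parser")
    for img in content_soup.find_all('img'):
        if img.attrs.get('data-src'):
            img.attrs['src'] = img.attrs['data-src']
    return title, author, str(content_soup)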