def update_all_mpwx_cron():
    """
    Update WeChat official-account feeds
    """
    sites = Site.objects.filter(status='active', creator='wemp').order_by('-star')

    for site in sites:
        host, action = get_host_name(site.rss), None

        if settings.ERSHICIMI_HOST in host:
            action = 11
        elif settings.QNMLGB_HOST in host:
            action = 10
        elif settings.WEMP_HOST in host:
            action = 12
        elif settings.CHUANSONGME_HOST in host:
            action = 13
        elif settings.ANYV_HOST in host:
            action = 14
        else:
            logger.warning(f"Unknown official-account host:`{host}`{site.cname}")

        if action is not None:
            make_mpwx_job(site, action)

    return True
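# Every function in this section leans on a `get_host_name` helper whose
# implementation lives elsewhere in the project. A minimal sketch, assuming
# it just extracts the lowercase host part of a URL via the standard library:
import urllib.parse

def get_host_name(url):
    # urlparse('https://mp.weixin.qq.com/s/abc').netloc -> 'mp.weixin.qq.com'
    return urllib.parse.urlparse(url).netloc.lower()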
def submit_a_feed(request):
    """
    A user submits a custom feed
    """
    feed_url = request.POST.get('url', '').strip()[:1024]
    user = get_login_user(request)

    if feed_url:
        host = get_host_name(feed_url)

        if 'ershicimi.com' in host:
            rsp = parse_wemp_ershicimi(feed_url)
        elif host in settings.ALLOWED_HOSTS:
            rsp = parse_self_atom(feed_url)
        else:
            rsp = parse_atom(feed_url)

        if rsp:
            logger.warning(f"New feed submitted:`{feed_url}")

            # Logged-in users are subscribed automatically
            if user:
                add_user_sub_feeds(user.oauth_id, [rsp['name'], ])

            return JsonResponse(rsp)
        else:
            logger.warning(f"RSS parsing failed:`{feed_url}")

    return HttpResponseNotFound("Param error")
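# `parse_atom` is assumed to run the feed through feedparser, persist a Site
# row, and hand a small dict back to the frontend; the return shape below is
# a guess based on the rsp['name'] access above, not the project's actual code.
import feedparser

def parse_atom(feed_url):
    feed_obj = feedparser.parse(feed_url)

    if not feed_obj.feed.get('title'):
        return None

    # The real implementation would also create the Site record here.
    return {'name': feed_obj.feed.title, 'url': feed_url}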
def update_sites_async(sites, force_update=False):
    """
    Asynchronously update a batch of feeds; only plain feeds and
    official-account feeds are supported
    """
    for site_id in sites:
        try:
            site = Site.objects.get(status='active', pk=site_id)
        except Site.DoesNotExist:
            continue

        # Skip sites that were updated recently
        if not force_update and is_updated_site(site_id):
            continue

        if site.creator != 'system':
            logger.info(f"Start async update: {site_id}")

            host = get_host_name(site.rss)

            if 'ershicimi.com' in host:
                parse_wemp_ershicimi(site.rss, update=True)
            else:
                atom_spider(site)

    return True
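# The is_updated_site / set_updated_site pair used above and in atom_spider
# is not shown in this section. One plausible backing store is a Redis key
# with a TTL, so a site counts as "recently updated" for a fixed window;
# the key name and the one-hour TTL below are assumptions.
import redis

r = redis.Redis()

def set_updated_site(site_id, ttl=3600):
    r.set(f'updated:site:{site_id}', 1, ex=ttl)

def is_updated_site(site_id):
    return r.exists(f'updated:site:{site_id}') > 0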
def submit_a_feed(request):
    """
    A user submits a custom feed
    """
    feed_url = request.POST.get('url', '').strip()[:1024]
    user = get_login_user(request)

    if feed_url:
        host = get_host_name(feed_url)

        if settings.ERSHICIMI_HOST in host:
            feed_url = feed_url.replace('/user/analysis?bid=', '/a/')
            rsp = add_ershicimi_feed(feed_url)
        elif host in settings.ALLOWED_HOSTS:
            rsp = add_self_feed(feed_url)
        elif settings.QNMLGB_HOST in host:
            rsp = add_qnmlgb_feed(feed_url)
        elif settings.WEMP_HOST in host:
            rsp = add_wemp_feed(feed_url)
        elif settings.CHUANSONGME_HOST in host:
            rsp = add_chuansongme_feed(feed_url)
        elif settings.ANYV_HOST in host:
            rsp = add_anyv_feed(feed_url)
        else:
            # Distinguish podcasts from plain RSS
            feed_obj = feedparser.parse(feed_url)

            if is_podcast_feed(feed_obj):
                rsp = add_postcast_feed(feed_obj)
            else:
                rsp = add_atom_feed(feed_obj)

        if rsp:
            logger.warning(f"New feed submitted:`{feed_url}")

            set_active_site(rsp['site'])

            # Logged-in users are subscribed automatically
            if user:
                add_user_sub_feeds(user.oauth_id, [rsp['site'], ])

            if rsp.get('creator') == 'user':
                # Only newly added plain RSS triggers an async update job
                django_rq.enqueue(update_sites_async, [rsp['site'], ],
                                  result_ttl=1, ttl=3600, failure_ttl=3600)

            return JsonResponse(rsp)
        else:
            logger.warning(f"RSS parsing failed:`{feed_url}")

    return HttpResponseNotFound("Param Error")
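# A guess at how `is_podcast_feed` might tell podcasts from plain RSS:
# feedparser exposes <enclosure> tags as entry.enclosures, and podcast
# enclosures carry an audio MIME type. This heuristic is an assumption,
# not the project's actual check.
def is_podcast_feed(feed_obj):
    for entry in feed_obj.entries:
        for enclosure in entry.get('enclosures', []):
            if enclosure.get('type', '').startswith('audio/'):
                return True
    return False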
def wemp_spider(url, site):
    """
    Crawl WeChat official-account content
    :param url:
    :param site:
    :return:
    """
    if is_crawled_url(url):
        return

    try:
        rsp = requests.get(url, timeout=10)

        if rsp.ok:
            try:
                if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                    title, author, content = parse_weixin_page(rsp)
                elif 'ershicimi.com' in get_host_name(rsp.url):
                    title, author, content = parse_ershicimi_page(rsp)
                else:
                    logger.warning(f'Unexpected official-account host:`{rsp.url}')
                    return
            except:
                logger.info(f'Failed to parse official-account page:`{rsp.url}')
                return

            article = Article(title=title, author=author, site=site, uindex=current_ts(),
                              content=content, src_url=url)
            article.save()

            mark_crawled_url(url)
    except (ConnectTimeout, HTTPError, ReadTimeout, Timeout, ConnectionError):
        logger.warning(f'Network error while crawling official account:`{url}')
    except:
        logger.warning(f'Unknown error while crawling official account:`{url}')
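# The is_crawled_url / mark_crawled_url dedup pair assumed by the spiders is
# not shown here. A sketch assuming a Redis set keyed by a hash of the URL
# (the key name and storage choice are guesses); *urls accommodates the
# two-argument call seen in parse_detail_page below.
import hashlib
import redis

r = redis.Redis()

def _url_key(url):
    return hashlib.md5(url.encode('utf-8')).hexdigest()

def mark_crawled_url(*urls):
    for url in urls:
        r.sadd('crawled:urls', _url_key(url))

def is_crawled_url(url):
    return r.sismember('crawled:urls', _url_key(url))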
def wemp_spider(url, site):
    """
    Crawl WeChat official-account content, either from the WeChat domain
    itself or from an ershicimi mirror
    :param url:
    :param site:
    :return:
    """
    if is_crawled_url(url):
        return

    rsp = get_with_proxy(url)

    if rsp is None:
        return

    if rsp.ok:
        try:
            if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                title, author, content = parse_weixin_page(rsp)
            elif 'ershicimi.com' in get_host_name(rsp.url):
                title, author, content = parse_ershicimi_page(rsp)
            else:
                logger.warning(f'Unexpected official-account host:`{rsp.url}')
                return
        except:
            logger.info(f'Failed to parse official-account page:`{rsp.url}')
            return

        article = Article(title=title, author=author, site=site, uindex=current_ts(),
                          content=content, src_url=url)
        article.save()

        mark_crawled_url(url)
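# This revision replaces the inline requests.get with get_with_proxy, which
# is assumed to route through a proxy pool and swallow network errors,
# returning None on failure. A minimal sketch; the proxy configuration is
# a placeholder, not the project's actual setup.
import requests

def get_with_proxy(url, timeout=10):
    proxies = {}  # would come from a proxy-pool configuration

    try:
        return requests.get(url, timeout=timeout, proxies=proxies)
    except requests.RequestException:
        return None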
def parse_detail_page(job):
    response = HtmlResponse(url=job.url, body=job.rsp, encoding='utf8')

    title, author, content = None, None, None

    # Decide by the post-redirect host
    host = get_host_name(job.rsp_url)

    if job.action == 20 or settings.MPWX_HOST in host:
        try:
            if response.selector.xpath("//div[@class='weui-msg__text-area']").extract_first():
                mark_crawled_url(job.url, job.rsp_url)
                logger.info(f"Content removed or violates policy:`{job.url}")
                return 6
        except:
            pass

        title, author, content = parse_mpwx_detail_page(response)

        if job.action != 20 and settings.MPWX_HOST in host:
            logger.info(f"Redirected to the original WeChat page:`{job.url}`{job.rsp_url}`{title}")
    elif job.action == 21:
        title, author, content = parse_ershicimi_detail_page(response)
    elif job.action == 22:
        title, author, content = parse_wemp_detail_page(response)
    elif job.action == 23:
        title, author, content = parse_chuansongme_detail_page(response)
    elif job.action == 24:
        title, author, content = parse_anyv_detail_page(response)

    mark_crawled_url(job.url, job.rsp_url)

    if title is None:
        logger.warning(f"Page parsing failed:`{title}`{job.url}")
        return 4
    else:
        try:
            uindex = current_ts()

            article = Article(title=title, author=author, site=job.site, uindex=uindex,
                              src_url=job.url)
            article.save()

            write_dat2_file(uindex, job.site_id, content)
        except:
            logger.warning(f"Failed to insert article:`{title}`{job.site}`{job.url}")
            return 7

    return 2
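# The integer returns of parse_detail_page (2, 4, 6, 7) read like job status
# codes. Collecting them in an IntEnum would make call sites self-describing;
# the names below are inferred from context, not taken from the project.
from enum import IntEnum

class JobStatus(IntEnum):
    SAVED = 2            # article parsed and stored
    PARSE_FAILED = 4     # page fetched but no title extracted
    CONTENT_REMOVED = 6  # WeChat "violation or deleted" interstitial
    INSERT_FAILED = 7    # database write raised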
def submit_a_feed(request):
    """
    A user submits a custom feed
    """
    feed_url = request.POST.get('url', '').strip()[:1024]
    user = get_login_user(request)

    if feed_url:
        host = get_host_name(feed_url)

        if 'ershicimi.com' in host:
            feed_url = feed_url.replace('/user/analysis?bid=', '/a/')
            rsp = parse_wemp_ershicimi(feed_url)
        elif host in settings.ALLOWED_HOSTS:
            rsp = parse_self_atom(feed_url)
        elif 'qnmlgb.tech' in host:
            rsp = parse_qnmlgb_atom(feed_url)
        else:
            rsp = parse_atom(feed_url)

        if rsp:
            logger.warning(f"New feed submitted:`{feed_url}")

            # Logged-in users are subscribed automatically
            if user:
                add_user_sub_feeds(user.oauth_id, [rsp['site'], ])

            # Async update job
            django_rq.enqueue(update_sites_async, [rsp['site'], ])

            return JsonResponse(rsp)
        else:
            logger.warning(f"RSS parsing failed:`{feed_url}")

    return HttpResponseNotFound("Param Error")
def update_all_wemp_cron():
    """
    Update WeChat official-account feeds, once or twice a day
    """
    sites = Site.objects.filter(status='active', creator='wemp').order_by('-star')

    for site in sites:
        # Skip feeds that nobody subscribes to and that are not recommended
        if not is_active_rss(site.pk) and site.star < 9:
            continue

        if not is_updated_site(site.pk):
            host = get_host_name(site.rss)

            if 'ershicimi.com' in host:
                parse_wemp_ershicimi(site.rss, update=True)
            elif 'qnmlgb.tech' in host:
                atom_spider(site)

    return True
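# `is_active_rss` presumably checks whether anyone still subscribes to the
# site. Assuming a subscription model (the name `UserSub` is a guess), a
# one-line Django ORM form would be:
def is_active_rss(site_id):
    return UserSub.objects.filter(site_id=site_id).exists()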
def atom_spider(site):
    """
    Update feed content
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        if site.star > 9:
            guard_log(f"RSS feed may be dead`{site.rss}")
        else:
            logger.info(f"RSS feed may be dead`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries[:12]:
        # Some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'Failed to read required attributes:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = ''

        try:
            value = entry.content[0].value
        except:
            value = entry.get('description') or entry.link

        # Make image URLs absolute
        try:
            content_soup = BeautifulSoup(value, "html.parser")

            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src

            value = str(content_soup)
        except:
            logger.warning(f'Failed to fix image paths:`{title}`{link}')

        # Second-pass crawl for official-account RSS
        if get_host_name(site.rss) in ('qnmlgb.tech', ):
            if get_host_name(link) in ('mp.weixin.qq.com', ):
                rsp = get_with_proxy(link)

                if rsp is not None and rsp.ok:
                    try:
                        title, author, value = parse_weixin_page(rsp)
                    except:
                        pass

        try:
            article = Article(site=site, title=title, author=author, src_url=link,
                              uindex=current_ts(), content=value)
            article.save()

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'Duplicate insert:`{title}`{link}')
        except:
            logger.warning(f'Insert failed:`{title}`{link}')

    set_updated_site(site.pk)

    return True
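# This revision swaps the bare requests.get for get_with_retry, assumed to
# retry a few times and return None once every attempt fails. A minimal
# sketch; the retry count is an assumption.
import requests

def get_with_retry(url, retries=3, timeout=30):
    for _ in range(retries):
        try:
            return requests.get(url, timeout=timeout)
        except requests.RequestException:
            continue
    return None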
def atom_spider(site):
    """
    Update feed content
    """
    try:
        resp = requests.get(site.rss, timeout=30, verify=False)
    except:
        if site.star > 9:
            logger.warning(f"RSS feed may be dead`{site.rss}")
        else:
            logger.info(f"RSS feed may be dead`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries[:12]:
        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'Failed to read required attributes:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = None

        try:
            value = entry.content[0].value
        except:
            value = entry.get('description') or entry.link

        # Make image URLs absolute
        try:
            content_soup = BeautifulSoup(value, "html.parser")

            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src

            value = str(content_soup)
        except:
            logger.warning(f'Failed to fix image paths:`{title}`{link}')

        # Second-pass crawl for official-account RSS
        if get_host_name(site.rss) in ('qnmlgb.tech', ):
            if get_host_name(link) in ('mp.weixin.qq.com', ):
                try:
                    rsp = requests.get(link, timeout=10)
                    title, author, value = parse_weixin_page(rsp)
                except (ConnectTimeout, HTTPError, ReadTimeout, Timeout, ConnectionError):
                    logger.warning(f"Network error on second-pass crawl:`{link}")
                except:
                    logger.warning(f"Unknown error on second-pass crawl:`{link}")

        try:
            article = Article(site=site, title=title, author=author, src_url=link,
                              uindex=current_ts(), content=value)
            article.save()

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'Duplicate insert:`{title}`{link}')
        except:
            logger.warning(f'Insert failed:`{title}`{link}')
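# Quick illustration of the image-path fix both atom_spider revisions apply:
# urljoin resolves a relative <img src> against the entry link, leaving
# already-absolute URLs untouched.
import urllib.parse

print(urllib.parse.urljoin('https://example.com/post/1', 'img/a.png'))
# -> 'https://example.com/post/img/a.png'
print(urllib.parse.urljoin('https://example.com/post/1', 'https://cdn.example.com/a.png'))
# -> 'https://cdn.example.com/a.png'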