Example #1
def update_all_mpwx_cron():
    """
    Update the WeChat official-account feeds
    """
    sites = Site.objects.filter(status='active',
                                creator='wemp').order_by('-star')

    for site in sites:
        host, action = get_host_name(site.rss), None

        if settings.QNMLGB_HOST in host or settings.ANYV_HOST in host or settings.ERSHICIMI_HOST in host:
            if settings.ERSHICIMI_HOST in host:
                action = 11
            elif settings.QNMLGB_HOST in host:
                action = 10
            elif settings.WEMP_HOST in host:
                action = 12
            elif settings.CHUANSONGME_HOST in host:
                action = 13
            elif settings.ANYV_HOST in host:
                action = 14
            else:
                logger.warning(f"未知的公众号域名:`{host}`{site.cname}")

            if action is not None:
                make_mpwx_job(site, action)

    return True
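
Every example on this page branches on the string returned by get_host_name, which is not itself shown here. A minimal sketch of such a helper, assuming it simply extracts the lower-cased host part of a URL with urllib.parse (the real project may normalize differently):

import urllib.parse


def get_host_name(url):
    # Assumed behavior: return the lower-cased network location ("host") of the URL.
    return urllib.parse.urlparse(url).netloc.lower()
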
Example #2
def submit_a_feed(request):
    """
    A user adds a custom subscription feed
    """
    feed_url = request.POST.get('url', '').strip()[:1024]
    user = get_login_user(request)

    if feed_url:
        host = get_host_name(feed_url)

        if 'ershicimi.com' in host:
            rsp = parse_wemp_ershicimi(feed_url)
        elif host in settings.ALLOWED_HOSTS:
            rsp = parse_self_atom(feed_url)
        else:
            rsp = parse_atom(feed_url)

        if rsp:
            logger.warning(f"有新订阅源被提交:`{feed_url}")

            # Logged-in users are subscribed automatically
            if user:
                add_user_sub_feeds(user.oauth_id, [
                    rsp['name'],
                ])
            return JsonResponse(rsp)
        else:
            logger.warning(f"RSS 解析失败:`{feed_url}")

    return HttpResponseNotFound("Param error")
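
For orientation, submit_a_feed is an ordinary Django POST view. A hypothetical call through Django's test client; the route below is an assumption, not taken from the project's URLconf:

from django.test import Client

client = Client()
# Anonymous submission: on success the view returns the parsed feed info as JSON,
# on a missing URL or a failed parse it returns a 404 "Param error" response.
rsp = client.post('/submit_a_feed/', {'url': 'https://example.com/atom.xml'})
print(rsp.status_code)
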
Example #3
def update_sites_async(sites, force_update=False):
    """
    Asynchronously update a batch of feeds; only regular feeds and WeChat official accounts are supported
    """
    for site_id in sites:
        try:
            site = Site.objects.get(status='active', pk=site_id)
        except:
            continue

        # Recently updated; skip
        if not force_update and is_updated_site(site_id):
            continue

        if site.creator != 'system':
            logger.info(f"开始异步更新:{site_id}")

            host = get_host_name(site.rss)

            if 'ershicimi.com' in host:
                parse_wemp_ershicimi(site.rss, update=True)
            else:
                atom_spider(site)

    return True
Example #4
def submit_a_feed(request):
    """
    A user adds a custom subscription feed
    """
    feed_url = request.POST.get('url', '').strip()[:1024]
    user = get_login_user(request)

    if feed_url:
        host = get_host_name(feed_url)

        if settings.ERSHICIMI_HOST in host:
            feed_url = feed_url.replace('/user/analysis?bid=', '/a/')
            rsp = add_ershicimi_feed(feed_url)
        elif host in settings.ALLOWED_HOSTS:
            rsp = add_self_feed(feed_url)
        elif settings.QNMLGB_HOST in host:
            rsp = add_qnmlgb_feed(feed_url)
        elif settings.WEMP_HOST in host:
            rsp = add_wemp_feed(feed_url)
        elif settings.CHUANSONGME_HOST in host:
            rsp = add_chuansongme_feed(feed_url)
        elif settings.ANYV_HOST in host:
            rsp = add_anyv_feed(feed_url)
        else:
            # Distinguish podcast feeds from regular RSS
            feed_obj = feedparser.parse(feed_url)

            if is_podcast_feed(feed_obj):
                rsp = add_postcast_feed(feed_obj)
            else:
                rsp = add_atom_feed(feed_obj)

        if rsp:
            logger.warning(f"有新订阅源被提交:`{feed_url}")

            set_active_site(rsp['site'])

            # Logged-in users are subscribed automatically
            if user:
                add_user_sub_feeds(user.oauth_id, [
                    rsp['site'],
                ])

            if rsp.get('creator') == 'user':
                # Only newly added regular RSS feeds trigger the async update task
                django_rq.enqueue(update_sites_async, [
                    rsp['site'],
                ],
                                  result_ttl=1,
                                  ttl=3600,
                                  failure_ttl=3600)

            return JsonResponse(rsp)
        else:
            logger.warning(f"RSS 解析失败:`{feed_url}")

    return HttpResponseNotFound("Param Error")
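
Example #4 relies on is_podcast_feed to tell podcasts apart from ordinary RSS before choosing add_postcast_feed or add_atom_feed. That helper is not shown on this page; a rough sketch, assuming the heuristic is simply "some entry carries an audio enclosure" in the already-parsed feedparser object:

def is_podcast_feed(feed_obj):
    # Assumed heuristic, not the project's actual implementation:
    # treat the feed as a podcast if any entry has an audio enclosure.
    for entry in feed_obj.entries:
        for enclosure in entry.get('enclosures', []):
            if enclosure.get('type', '').startswith('audio/'):
                return True
    return False
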
Example #5
def wemp_spider(url, site):
    """
    Crawl WeChat official-account content
    :param url:
    :param site:
    :return:
    """
    if is_crawled_url(url):
        return

    try:
        rsp = requests.get(url, timeout=10)

        if rsp.ok:
            try:
                if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                    title, author, content = parse_weixin_page(rsp)
                elif 'ershicimi.com' in get_host_name(rsp.url):
                    title, author, content = parse_ershicimi_page(rsp)
                else:
                    logger.warning(f'Unexpected official-account domain: `{rsp.url}')
                    return
            except:
                logger.info(f'Failed to parse official-account content: `{rsp.url}')
                return

            article = Article(title=title,
                              author=author,
                              site=site,
                              uindex=current_ts(),
                              content=content,
                              src_url=url)
            article.save()

            mark_crawled_url(url)
    except (ConnectTimeout, HTTPError, ReadTimeout, Timeout, ConnectionError):
        logger.warning(f'Network error while crawling the official account: `{url}')
    except:
        logger.warning(f'Unknown error while crawling the official account: `{url}')
Example #6
def wemp_spider(url, site):
    """
    Crawl WeChat official-account content; supports both the WeChat domain and the ershicimi domain
    :param url:
    :param site:
    :return:
    """
    if is_crawled_url(url):
        return

    rsp = get_with_proxy(url)
    if rsp is None:
        return

    if rsp.ok:
        try:
            if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                title, author, content = parse_weixin_page(rsp)
            elif 'ershicimi.com' in get_host_name(rsp.url):
                title, author, content = parse_ershicimi_page(rsp)
            else:
                logger.warning(f'Unexpected official-account domain: `{rsp.url}')
                return
        except:
            logger.info(f'Failed to parse official-account content: `{rsp.url}')
            return

        article = Article(title=title,
                          author=author,
                          site=site,
                          uindex=current_ts(),
                          content=content,
                          src_url=url)
        article.save()

        mark_crawled_url(url)
Example #7
def parse_detail_page(job):
    response = HtmlResponse(url=job.url, body=job.rsp, encoding='utf8')
    title, author, content = None, None, None

    # Check the domain after any redirect
    host = get_host_name(job.rsp_url)

    if job.action == 20 or settings.MPWX_HOST in host:
        try:
            if response.selector.xpath("//div[@class='weui-msg__text-area']").extract_first():
                mark_crawled_url(job.url, job.rsp_url)
                logger.info(f"内容违规或删除:`{job.url}")
                return 6
        except:
            pass

        title, author, content = parse_mpwx_detail_page(response)

        if job.action != 20 and settings.MPWX_HOST in host:
            logger.info(f"跳转到微信原文:`{job.url}`{job.rsp_url}`{title}")

    elif job.action == 21:
        title, author, content = parse_ershicimi_detail_page(response)
    elif job.action == 22:
        title, author, content = parse_wemp_detail_page(response)
    elif job.action == 23:
        title, author, content = parse_chuansongme_detail_page(response)
    elif job.action == 24:
        title, author, content = parse_anyv_detail_page(response)

    mark_crawled_url(job.url, job.rsp_url)

    if title is None:
        logger.warning(f"页面解析失败:`{title}`{job.url}")
        return 4
    else:
        try:
            uindex = current_ts()

            article = Article(title=title, author=author, site=job.site, uindex=uindex, src_url=job.url)
            article.save()

            write_dat2_file(uindex, job.site_id, content)
        except:
            logger.warning(f"插入文章异常:`{title}`{job.site}`{job.url}")
            return 7

        return 2
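
The job argument in Example #7 is only read through a handful of attributes. A hypothetical container listing just those fields (the real project may define its job object differently):

from dataclasses import dataclass


@dataclass
class Job:
    # Hypothetical shape: only the attributes parse_detail_page actually reads.
    url: str      # originally requested URL
    rsp: bytes    # raw response body handed to HtmlResponse
    rsp_url: str  # final URL after redirects, used to detect the jump to mp.weixin.qq.com
    action: int   # dispatch code (20-24 in the example above)
    site: object  # Site model instance stored on the Article
    site_id: int  # used when writing the content file
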
Example #8
def submit_a_feed(request):
    """
    A user adds a custom subscription feed
    """
    feed_url = request.POST.get('url', '').strip()[:1024]
    user = get_login_user(request)

    if feed_url:
        host = get_host_name(feed_url)

        if 'ershicimi.com' in host:
            feed_url = feed_url.replace('/user/analysis?bid=', '/a/')
            rsp = parse_wemp_ershicimi(feed_url)
        elif host in settings.ALLOWED_HOSTS:
            rsp = parse_self_atom(feed_url)
        elif 'qnmlgb.tech' in host:
            rsp = parse_qnmlgb_atom(feed_url)
        else:
            rsp = parse_atom(feed_url)

        if rsp:
            logger.warning(f"有新订阅源被提交:`{feed_url}")

            # Logged-in users are subscribed automatically
            if user:
                add_user_sub_feeds(user.oauth_id, [
                    rsp['site'],
                ])

            # Async update task
            django_rq.enqueue(update_sites_async, [
                rsp['site'],
            ])

            return JsonResponse(rsp)
        else:
            logger.warning(f"RSS 解析失败:`{feed_url}")

    return HttpResponseNotFound("Param Error")
Example #9
def update_all_wemp_cron():
    """
    Update WeChat official accounts, one to two times a day
    """
    sites = Site.objects.filter(status='active',
                                creator='wemp').order_by('-star')

    for site in sites:
        # Skip feeds that have no subscribers and are not recommended
        if not is_active_rss(site.pk) and site.star < 9:
            continue

        if not is_updated_site(site.pk):
            host = get_host_name(site.rss)

            if 'ershicimi.com' in host:
                parse_wemp_ershicimi(site.rss, update=True)
            elif 'qnmlgb.tech' in host:
                atom_spider(site)
            else:
                pass

    return True
Example #10
def atom_spider(site):
    """
    Update the feed's content
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        if site.star > 9:
            guard_log(f"RSS 源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries[:12]:
        # Some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'Failed to get required attributes: `{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = ''

        try:
            value = entry.content[0].value
        except:
            value = entry.get('description') or entry.link

        # Convert relative image URLs to absolute
        try:
            content_soup = BeautifulSoup(value, "html.parser")

            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src

            value = str(content_soup)
        except:
            logger.warning(f'Failed to fix image paths: `{title}`{link}')

        # Second-pass crawl for WeChat official-account RSS
        if get_host_name(site.rss) in ('qnmlgb.tech', ):
            if get_host_name(link) in ('mp.weixin.qq.com', ):
                rsp = get_with_proxy(link)

                if rsp is not None and rsp.ok:
                    try:
                        title, author, value = parse_weixin_page(rsp)
                    except:
                        pass
        try:
            article = Article(site=site,
                              title=title,
                              author=author,
                              src_url=link,
                              uindex=current_ts(),
                              content=value)
            article.save()

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'Duplicate insert skipped: `{title}`{link}')
        except:
            logger.warning(f'Failed to insert data: `{title}`{link}')

    set_updated_site(site.pk)
    return True
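
Example #10 differs from Example #11 mainly in fetching through get_with_retry instead of a bare requests.get. That helper is not shown here; a plausible sketch, assuming it just retries the GET a few times and returns None on total failure (the name is from the example, the defaults are guesses):

import requests


def get_with_retry(url, retries=3, timeout=30):
    # Assumed helper: try the request a few times and return the response on success, None otherwise.
    for _ in range(retries):
        try:
            rsp = requests.get(url, timeout=timeout)
            if rsp.ok:
                return rsp
        except requests.RequestException:
            continue
    return None
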
Example #11
def atom_spider(site):
    """
    Update the feed's content
    """
    try:
        resp = requests.get(site.rss, timeout=30, verify=False)
    except:
        if site.star > 9:
            logger.warning(f"RSS源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries[:12]:
        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'Failed to get required attributes: `{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = None

        try:
            value = entry.content[0].value
        except:
            value = entry.get('description') or entry.link

        # Convert relative image URLs to absolute
        try:
            content_soup = BeautifulSoup(value, "html.parser")

            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src

            value = str(content_soup)
        except:
            logger.warning(f'Failed to fix image paths: `{title}`{link}')

        # Second-pass crawl for WeChat official-account RSS
        if get_host_name(site.rss) in ('qnmlgb.tech', ):
            if get_host_name(link) in ('mp.weixin.qq.com', ):
                try:
                    rsp = requests.get(link, timeout=10)
                    title, author, value = parse_weixin_page(rsp)
                except (ConnectTimeout, HTTPError, ReadTimeout, Timeout, ConnectionError):
                    logger.warning(f"公众号二次爬取出现网络异常:`{link}")
                except:
                    logger.warning(f"公众号二次爬取出现未知异常:`{link}")

        try:
            article = Article(site=site, title=title, author=author, src_url=link, uindex=current_ts(), content=value)
            article.save()

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'Duplicate insert skipped: `{title}`{link}')
        except:
            logger.warning(f'Failed to insert data: `{title}`{link}')