Example #1
def wemp_spider(urls, site):
    """
    抓取微信内容
    :param urls:
    :param site:
    :return:
    """
    for url in urls:
        if is_crawled_url(url):
            continue

        try:
            logger.info(f'开始爬取公众号地址:`{url}')
            rsp = requests.get(url, timeout=10)

            if rsp.ok:
                response = HtmlResponse(url=url,
                                        body=rsp.text,
                                        encoding='utf8')

                title = response.selector.xpath(
                    '//h2[@id="activity-name"]/text()').extract_first().strip()
                content = response.selector.xpath(
                    '//div[@id="js_content"]').extract_first().strip()

                try:
                    author = response.selector.xpath(
                        '//span[@id="js_author_name"]/text()').extract_first().strip()
                except:
                    author = response.selector.xpath(
                        '//a[@id="js_name"]/text()').extract_first().strip()

                if title and content:
                    content_soup = BeautifulSoup(content, "html.parser")
                    for img in content_soup.find_all('img'):
                        if img.attrs.get('data-src'):
                            img.attrs['src'] = img.attrs['data-src']

                    article = Article(title=title,
                                      author=author,
                                      site=site,
                                      uindex=current_ts(),
                                      content=str(content_soup),
                                      src_url=url)
                    article.save()

                    mark_crawled_url(url)
                else:
                    logger.warning(f'公众号内容解析异常:`{title}`{author}`{content}')
        except (ConnectTimeout, HTTPError, ReadTimeout, Timeout,
                ConnectionError):
            logger.warning(f'公众号爬取出现网络异常:`{url}')
        except:
            logger.warning(f'公众号爬取出现未知异常:`{url}')
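
The `is_crawled_url` / `mark_crawled_url` pair these spiders rely on is not shown in the examples. A minimal sketch, assuming a Redis set keyed by URL; the key name and connection details are hypothetical (note that Example #2 also passes the post-redirect URL as a second argument):

import redis

# Hypothetical Redis-backed URL store; key name and connection settings are assumptions
_redis = redis.Redis(host='localhost', port=6379, db=0)
_CRAWLED_KEY = 'spider:crawled_urls'


def is_crawled_url(url):
    # True if this URL has been crawled before
    return bool(_redis.sismember(_CRAWLED_KEY, url))


def mark_crawled_url(url, rsp_url=None):
    # Record the request URL, plus the post-redirect URL when one is given
    _redis.sadd(_CRAWLED_KEY, url)
    if rsp_url:
        _redis.sadd(_CRAWLED_KEY, rsp_url)
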
Example #2
def parse_detail_page(job):
    response = HtmlResponse(url=job.url, body=job.rsp, encoding='utf8')
    title, author, content = None, None, None

    # Check the host name after any redirect
    host = get_host_name(job.rsp_url)

    if job.action == 20 or settings.MPWX_HOST in host:
        try:
            if response.selector.xpath("//div[@class='weui-msg__text-area']").extract_first():
                mark_crawled_url(job.url, job.rsp_url)
                logger.info(f"内容违规或删除:`{job.url}")
                return 6
        except:
            pass

        title, author, content = parse_mpwx_detail_page(response)

        if job.action != 20 and settings.MPWX_HOST in host:
            logger.info(f"跳转到微信原文:`{job.url}`{job.rsp_url}`{title}")

    elif job.action == 21:
        title, author, content = parse_ershicimi_detail_page(response)
    elif job.action == 22:
        title, author, content = parse_wemp_detail_page(response)
    elif job.action == 23:
        title, author, content = parse_chuansongme_detail_page(response)
    elif job.action == 24:
        title, author, content = parse_anyv_detail_page(response)

    mark_crawled_url(job.url, job.rsp_url)

    if title is None:
        logger.warning(f"页面解析失败:`{title}`{job.url}")
        return 4
    else:
        try:
            uindex = current_ts()

            article = Article(title=title, author=author, site=job.site, uindex=uindex, src_url=job.url)
            article.save()

            write_dat2_file(uindex, job.site_id, content)
        except:
            logger.warning(f"插入文章异常:`{title}`{job.site}`{job.url}")
            return 7

        return 2
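
`get_host_name` is assumed to return just the host portion of a URL (it is compared against `settings.MPWX_HOST` above and against 'mp.weixin.qq.com' in later examples); a plausible sketch on top of the standard library:

from urllib.parse import urlparse


def get_host_name(url):
    # e.g. 'https://mp.weixin.qq.com/s/abc' -> 'mp.weixin.qq.com' (sketch; real helper not shown)
    return urlparse(url).netloc.lower()
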
Example #3
def wemp_spider(url, site):
    """
    抓取微信内容
    :param url:
    :param site:
    :return:
    """
    if is_crawled_url(url):
        return

    try:
        rsp = requests.get(url, timeout=10)

        if rsp.ok:
            try:
                if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                    title, author, content = parse_weixin_page(rsp)
                elif 'ershicimi.com' in get_host_name(rsp.url):
                    title, author, content = parse_ershicimi_page(rsp)
                else:
                    logger.warning(f'公众号域名解析异常:`{rsp.url}')
                    return
            except:
                logger.info(f'公众号内容解析异常:`{rsp.url}')
                return

            article = Article(title=title,
                              author=author,
                              site=site,
                              uindex=current_ts(),
                              content=content,
                              src_url=url)
            article.save()

            mark_crawled_url(url)
    except (ConnectTimeout, HTTPError, ReadTimeout, Timeout, ConnectionError):
        logger.warning(f'公众号爬取出现网络异常:`{url}')
    except:
        logger.warning(f'公众号爬取出现未知异常:`{url}')
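
In this example (and Example #4) the WeChat parsing is factored out into `parse_weixin_page`. The helper itself is not shown; a sketch that reuses the selectors Example #1 applies inline (the AttributeError fallback covers the case where the first author selector matches nothing):

from scrapy.http import HtmlResponse


def parse_weixin_page(rsp):
    # Extract (title, author, content) from a mp.weixin.qq.com page (sketch based on Example #1)
    response = HtmlResponse(url=rsp.url, body=rsp.text, encoding='utf8')

    title = response.selector.xpath(
        '//h2[@id="activity-name"]/text()').extract_first().strip()
    content = response.selector.xpath(
        '//div[@id="js_content"]').extract_first().strip()

    try:
        author = response.selector.xpath(
            '//span[@id="js_author_name"]/text()').extract_first().strip()
    except AttributeError:
        author = response.selector.xpath(
            '//a[@id="js_name"]/text()').extract_first().strip()

    return title, author, content
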
Example #4
def wemp_spider(url, site):
    """
    抓取微信内容,支持直接微信域名或者 ershicimi 域名
    :param url:
    :param site:
    :return:
    """
    if is_crawled_url(url):
        return

    rsp = get_with_proxy(url)
    if rsp is None:
        return

    if rsp.ok:
        try:
            if get_host_name(rsp.url) == 'mp.weixin.qq.com':
                title, author, content = parse_weixin_page(rsp)
            elif 'ershicimi.com' in get_host_name(rsp.url):
                title, author, content = parse_ershicimi_page(rsp)
            else:
                logger.warning(f'公众号域名解析异常:`{rsp.url}')
                return
        except:
            logger.info(f'公众号内容解析异常:`{rsp.url}')
            return

        article = Article(title=title,
                          author=author,
                          site=site,
                          uindex=current_ts(),
                          content=content,
                          src_url=url)
        article.save()

        mark_crawled_url(url)
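
This variant fetches through `get_with_proxy` and expects `None` on failure instead of an exception. A minimal sketch under that assumption; the `SPIDER_PROXIES` setting name is hypothetical:

import requests
from django.conf import settings


def get_with_proxy(url, timeout=10):
    # Fetch a URL via an HTTP proxy; return None on any network error (sketch)
    proxies = getattr(settings, 'SPIDER_PROXIES', None)  # e.g. {'http': ..., 'https': ...}
    try:
        return requests.get(url, proxies=proxies, timeout=timeout)
    except requests.RequestException:
        return None
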
Example #5
def update_all_user_feed():
    """
    更新所有 site
    """
    logger.info('开始运行定时更新RSS任务')

    now = datetime.datetime.now()

    # Update at different frequencies, with 4 hours as one full cycle
    if now.hour % 4 == 0:
        feeds = Site.objects.filter(status='active',
                                    creator='user').order_by('-star')
    elif now.hour % 4 == 1:
        feeds = []
    elif now.hour % 4 == 2:
        feeds = Site.objects.filter(status='active',
                                    creator='user',
                                    star__gte=20).order_by('-star')
    elif now.hour % 4 == 3:
        feeds = Site.objects.filter(status='active',
                                    creator='user',
                                    star__gte=9).order_by('-star')

    for site in feeds:
        try:
            resp = requests.get(site.rss, timeout=30, verify=False)
        except:
            if site.star >= 9:
                logger.warning(f"RSS源可能失效了`{site.rss}")
            else:
                logger.info(f"RSS源可能失效了`{site.rss}")
            continue

        content = BytesIO(resp.content)
        feed_obj = feedparser.parse(content)

        for entry in feed_obj.entries[:10]:
            try:
                title = entry.title
                link = entry.link
            except AttributeError:
                logger.warning(f'必要属性获取失败:`{site.rss}')
                continue

            if is_crawled_url(link):
                continue

            try:
                author = entry['author'][:11]
            except:
                author = None

            try:
                value = entry.content[0].value
            except:
                value = entry.get('description') or entry.link

            try:
                article = Article(site=site,
                                  title=title,
                                  author=author,
                                  src_url=link,
                                  uindex=current_ts(),
                                  content=value)
                article.save()
                mark_crawled_url(link)
            except django.db.utils.IntegrityError:
                logger.info(f'数据重复插入:`{title}`{link}')
            except:
                logger.warning(f'数据插入异常:`{title}`{link}')
    logger.info('定时更新RSS任务运行结束')
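
`current_ts()` is used throughout as a unique, sortable article index; the `current_ts() - 7 * 86400 * 1000` offsets in Examples #6 and #7 suggest it is a millisecond Unix timestamp. A one-line sketch under that assumption:

import time


def current_ts():
    # Current Unix time in milliseconds (inferred from the 7 * 86400 * 1000 offsets used elsewhere)
    return int(time.time() * 1000)
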
Example #6
def build_whoosh_index_cron():
    """
    建立全文搜索索引
    """
    from web.utils import whoosh_site_schema, whoosh_article_schema
    from whoosh.filedb.filestore import FileStorage
    from whoosh.qparser import QueryParser

    idx_dir = settings.WHOOSH_IDX_DIR
    first_boot = False

    if not os.path.exists(idx_dir):
        os.makedirs(idx_dir)
        first_boot = True

    storage = FileStorage(idx_dir)

    # Index the sites
    if first_boot:
        idx = storage.create_index(whoosh_site_schema, indexname="site")
    else:
        idx = storage.open_index(indexname="site", schema=whoosh_site_schema)

    idx_writer = idx.writer()

    for site_id in get_active_sites():
        # Skip if already indexed
        if is_indexed('site', site_id) and not first_boot:
            continue

        try:
            site = Site.objects.get(pk=site_id, status='active')
        except:
            continue

        cname = split_cn_words(site.cname, join=True)
        author = site.author or ''
        brief = split_cn_words(site.brief, join=True)

        logger.info(f"源分词结果:`{site_id}`{cname}`{brief}")

        try:
            idx_writer.add_document(id=site_id,
                                    cname=cname,
                                    author=author,
                                    brief=brief)
            set_indexed('site', site_id)
        except:
            logger.warning(f"源索引失败:`{site_id}")
    idx_writer.commit()

    # Index the articles
    if first_boot:
        idx = storage.create_index(whoosh_article_schema, indexname="article")
    else:
        idx = storage.open_index(indexname="article",
                                 schema=whoosh_article_schema)

    idx_writer = idx.writer()

    for uindex in get_recent_articles():
        # Skip if already indexed
        if is_indexed('article', uindex) and not first_boot:
            continue

        try:
            article = Article.objects.get(uindex=uindex, status='active')
        except:
            continue

        content = get_content(uindex, article.site_id)

        if content:
            title = split_cn_words(article.title, join=True)
            author = article.author or ''

            content_soup = BeautifulSoup(content, 'html.parser')
            content = split_cn_words(content_soup.get_text(),
                                     join=True,
                                     limit=20)

            logger.info(f"文章分词结果:`{uindex}`{title}")

            try:
                idx_writer.add_document(uindex=uindex,
                                        title=title,
                                        author=author,
                                        content=content)
                set_indexed('article', uindex)
            except:
                logger.warning(f"文章索引失败:`{uindex}")
    idx_writer.commit()

    # Purge expired articles (older than one week)
    idx = storage.open_index(indexname="article", schema=whoosh_article_schema)
    idx_writer = idx.writer()

    lastweek_ts = str(current_ts() - 7 * 86400 * 1000)
    query = QueryParser("uindex",
                        idx.schema).parse('uindex:{to %s]' % lastweek_ts)

    with idx.searcher() as searcher:
        idx_writer.delete_by_query(query, searcher)
        idx_writer.commit()

    return True
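
The schemas imported from `web.utils` are not shown. Given the fields written above (`id`, `cname`, `author`, `brief` for sites; `uindex`, `title`, `author`, `content` for articles) they could look roughly like this; the field types and options are guesses:

from whoosh.fields import Schema, ID, TEXT

# Rough sketch of the two schemas; stored/unique options and field types are assumptions
whoosh_site_schema = Schema(
    id=ID(stored=True, unique=True),      # Site primary key
    cname=TEXT,                           # pre-segmented Chinese name
    author=TEXT,
    brief=TEXT,
)

whoosh_article_schema = Schema(
    uindex=ID(stored=True, unique=True),  # millisecond timestamp used as the article key
    title=TEXT,
    author=TEXT,
    content=TEXT,
)
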
Example #7
def in_site_search(request):
    """
    站内搜索
    """
    user = get_login_user(request)
    keyword = request.POST.get('keyword', '').strip()
    scope = request.POST.get('scope', 'all')

    logger.warning(f"搜索关键字:`{keyword}")
    keyword = split_cn_words(keyword, join=True)
    logger.info(f"转换后的关键字:`{keyword}")

    if scope not in ('all', 'feed', 'article'):
        return HttpResponseForbidden('Param Error')

    if not keyword:
        return HttpResponseNotFound("Empty Keyword")

    storage = FileStorage(settings.WHOOSH_IDX_DIR)
    rel_sites, rel_articles = None, None

    # Find matching feeds
    if scope in ('feed', 'all'):
        idx = storage.open_index(indexname="site", schema=whoosh_site_schema)
        qp = MultifieldParser(['cname', 'author', 'brief'],
                              schema=whoosh_site_schema)
        query = qp.parse(keyword)
        sites = []

        with idx.searcher() as s:
            results = s.search(query, limit=50)

            for ret in results:
                sites.append(ret['id'])

        rel_sites = Site.objects.filter(status='active',
                                        pk__in=sites).order_by('-star')
    elif scope == 'article':
        # Find matching articles
        idx = storage.open_index(indexname="article",
                                 schema=whoosh_article_schema)
        qp = MultifieldParser(['title', 'author', 'content'],
                              schema=whoosh_article_schema)
        query = qp.parse(keyword)
        articles = []

        with idx.searcher() as s:
            old_mask = TermRange("uindex", None,
                                 str(current_ts() - 7 * 86400 * 1000))
            results = s.search(query, mask=old_mask, limit=50)

            for ret in results:
                articles.append(ret['uindex'])
        rel_articles = Article.objects.filter(is_recent=True,
                                              status='active',
                                              uindex__in=articles).iterator()

    # The user's subscriptions
    user_sub_feeds = []
    if user:
        user_sub_feeds = get_user_subscribe_feeds(user.oauth_id,
                                                  user_level=user.level)

    context = dict()
    context['user'] = user
    context['user_sub_feeds'] = user_sub_feeds
    context['rel_sites'] = rel_sites
    context['rel_articles'] = rel_articles
    context['keyword'] = keyword

    if scope == 'all':
        return render(request, 'search/search.html', context=context)
    elif scope == 'feed':
        return render(request, 'search/search_feeds.html', context=context)
    elif scope == 'article':
        return render(request, 'search/search_articles.html', context=context)
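
A possible way to exercise this view from a Django test, assuming it is routed at a hypothetical `/search/` URL and that the Whoosh indexes from Example #6 already exist:

from django.test import Client

client = Client()
# '/search/' is a hypothetical route for in_site_search; adjust to the project's urls.py
resp = client.post('/search/', {'keyword': 'python', 'scope': 'feed'})
print(resp.status_code)
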
Example #8
def podcast_spider(site):
    """
    更新源内容
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        logger.info(f"RSS 源可能失效了`{site.rss}")
        return None

    feed_obj = feedparser.parse(BytesIO(resp.content))

    for entry in feed_obj.entries:
        # Some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
        except AttributeError:
            logger.warning(f'title 获取失败:`{site.rss}')
            continue

        link = entry.get('link') or entry.get('guid')
        if not link:
            logger.warning(f'link 获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = ''

        audio, img = None, ''
        if entry.get('links'):
            for el in entry['links']:
                if 'audio/' in el.get('type', '') or el.get('rel') == 'enclosure':
                    audio = el
                    break

        if entry.get('image'):
            img = entry.image.get('href')

        try:
            brief = entry.content[0].value
        except:
            brief = entry.get('description') or entry.link

        if audio is not None:
            # Build the episode data required by the Podlove player
            episode = {
                "version": 5,
                "show": {
                    "title": site.cname,
                    "subtitle": site.brief,
                    "poster": site.favicon,
                    "link": site.link,
                },
                "title": title,
                "link": link,
                # "subtitle": brief,
                "publicationDate": entry.get('published'),
                "poster": img,
                "duration": to_podcast_duration(entry.get('itunes_duration', '')),
                "audio": [
                    {
                        "url": audio.href,
                        "mimeType": audio.type
                    }
                ]
            }
            episode = json.dumps(episode)
            episode = b64encode(bytes(episode, encoding='UTF8')).decode('UTF8')
            content = podcast_tmpl % episode + brief
        else:
            content = brief + f'''<p></p><img src="{img}">'''

        try:
            uindex = current_ts()

            article = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            article.save()

            write_dat2_file(uindex, site.id, content)

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
            mark_crawled_url(link)
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')

    return True
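
`to_podcast_duration` is assumed to normalize the `itunes:duration` value, which feeds publish either as plain seconds or as `MM:SS` / `HH:MM:SS`, into a single `HH:MM:SS` form for the player. A sketch under that assumption; the real helper and its exact output format are not shown:

def to_podcast_duration(raw):
    # Normalize an itunes:duration value to 'HH:MM:SS' (sketch)
    raw = (raw or '').strip()
    if not raw:
        return ''
    try:
        if ':' in raw:
            parts = [int(p) for p in raw.split(':')]
            while len(parts) < 3:
                parts.insert(0, 0)  # pad 'MM:SS' up to 'HH:MM:SS'
            hours, minutes, seconds = parts[-3:]
            total = hours * 3600 + minutes * 60 + seconds
        else:
            total = int(float(raw))  # plain seconds
    except ValueError:
        return ''
    return '%02d:%02d:%02d' % (total // 3600, (total % 3600) // 60, total % 60)
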
Example #9
def atom_spider(site):
    """
    更新源内容
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        if site.star > 9:
            logger.warning(f"RSS 源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries:
        # Some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = ''

        try:
            value = entry.content[0].value
        except (AttributeError, IndexError):
            value = None

        if not value:
            value = entry.get('description') or entry.link

        # Convert relative image URLs to absolute ones
        try:
            content_soup = BeautifulSoup(value, "html.parser")

            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src

            value = str(content_soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        try:
            uindex = current_ts()

            article = Article(site=site, title=title, author=author, src_url=link, uindex=uindex)
            article.save()

            write_dat2_file(uindex, site.id, value)

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')

    set_updated_site(site.pk)
    return True
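
`get_with_retry` (also used by Examples #8 and #13) is expected to return `None` when the feed cannot be fetched. A minimal sketch with a couple of retries, mirroring the `verify=False` used in Examples #5 and #12:

import requests


def get_with_retry(url, retries=2, timeout=30):
    # GET a URL with simple retries; return None if every attempt fails (sketch)
    for _ in range(retries + 1):
        try:
            resp = requests.get(url, timeout=timeout, verify=False)
            if resp.ok:
                return resp
        except requests.RequestException:
            continue
    return None
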
Example #10
    def handle(self, *args, **kwargs):
        time = timezone.now().strftime('%X')
        self.stdout.write("It's now %s" % time)
        logger.info("the job sheduler stared")

        logger.info('开始运行定时更新RSS任务')
        now = datetime.now()
        if now.hour % 4 == 0:
            feeds = Site.objects.filter(status='active',
                                        creator='user').order_by('-star')
        elif now.hour % 4 == 1:
            feeds = Site.objects.filter(status='active',
                                        creator='user',
                                        star__gte=50).order_by('-star')
        elif now.hour % 4 == 2:
            feeds = Site.objects.filter(status='active',
                                        creator='user',
                                        star__gte=20).order_by('-star')
        elif now.hour % 4 == 3:
            feeds = Site.objects.filter(status='active',
                                        creator='user',
                                        star__gte=9).order_by('-star')

        for site in feeds:

            logger.info(f"RSS源`{site.rss}")

            feed_obj = feedparser.parse(site.rss)

            for entry in feed_obj.entries[:10]:
                try:
                    title = entry.title
                    link = entry.link
                    #logger.info(f"RSS源`{title}")
                except AttributeError:
                    logger.warning(f'必要属性获取失败:`{site.rss}')
                    continue

                # if is_crawled_url(link):
                #     continue

                try:
                    author = entry['author'][:11]
                    logger.info(f"RSS源数据author`{author}")
                except:
                    author = None
                    #logger.info(f"RSS源数据author`{author}")

                try:
                    value = entry.content[0].value
                except:
                    value = entry.get('description') or entry.link

                try:
                    article = Article(site=site,
                                      title=title,
                                      author=author,
                                      src_url=link,
                                      uindex=current_ts(),
                                      content=value)
                    article.save()
                    # mark_crawled_url(link)
                except django.db.utils.IntegrityError:
                    logger.info(f'数据重复插入:`{title}`{link}')
                except:
                    logger.warning(f'数据插入异常:`{title}`{link}')

        logger.info('定时更新RSS任务运行结束')
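
This `handle` method belongs to a Django management command; the enclosing class would look roughly like the skeleton below (the help text is hypothetical):

from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = 'Run the scheduled RSS update once'  # hypothetical help text

    def handle(self, *args, **kwargs):
        ...  # body as shown in the example above
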
Example #11
def update_subsite_user_feed(global_subsite):

    logger.info('开始更新自定义RSS任务')
    feeds = Site.objects.filter(name__in=global_subsite,
                                status='active',
                                creator='user',
                                star__gte=9).order_by('-star')

    for site in feeds:

        feed_obj = feedparser.parse(site.rss)
        for entry in feed_obj.entries[:10]:
            try:
                title = entry.title
                link = entry.link
            except AttributeError:
                logger.warning(f'必要属性获取失败:`{site.rss}')
                continue

            # if is_crawled_url(link):
            #     continue

            try:
                author = entry['author'][:11]
            except:
                author = None

            try:
                value = entry.content[0].value
            except:
                # TODO: fetch full descriptions
                value = entry.get('description') or entry.link

            try:
                article = Article(site=site, title=title, author=author, src_url=link, uindex=current_ts(), content=value)
                article.save()
                #mark_crawled_url(link)

            except django.db.utils.IntegrityError:
                logger.info(f'数据重复插入:`{title}`{link}')
            except:
                logger.warning(f'数据插入异常:`{title}`{link}')
    logger.info('更新自定义RSS任务运行结束')
Example #12
def atom_spider(site):
    """
    更新源内容
    """
    try:
        resp = requests.get(site.rss, timeout=30, verify=False)
    except:
        if site.star >= 9:
            logger.warning(f"RSS源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries[:10]:
        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = None

        try:
            value = entry.content[0].value
        except:
            value = entry.get('description') or entry.link

        # Convert relative image URLs to absolute ones
        try:
            content_soup = BeautifulSoup(value, "html.parser")

            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src

            value = str(content_soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        try:
            article = Article(site=site, title=title, author=author, src_url=link, uindex=current_ts(),
                                content=value)
            article.save()
            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')
Example #13
def atom_spider(site):
    """
    更新源内容
    """
    resp = get_with_retry(site.rss)

    if resp is None:
        if site.star > 9:
            guard_log(f"RSS 源可能失效了`{site.rss}")
        else:
            logger.info(f"RSS源可能失效了`{site.rss}")
        return None

    content = BytesIO(resp.content)
    feed_obj = feedparser.parse(content)

    for entry in feed_obj.entries[:12]:
        # Some entries are empty
        if not entry:
            continue

        try:
            title = entry.title
            link = entry.link
        except AttributeError:
            logger.warning(f'必要属性获取失败:`{site.rss}')
            continue

        if is_crawled_url(link):
            continue

        try:
            author = entry['author'][:20]
        except:
            author = ''

        try:
            value = entry.content[0].value
        except:
            value = entry.get('description') or entry.link

        # Convert relative image URLs to absolute ones
        try:
            content_soup = BeautifulSoup(value, "html.parser")

            for img in content_soup.find_all('img'):
                rel_src = img.attrs.get('src')
                abs_src = urllib.parse.urljoin(link, rel_src)
                img.attrs['src'] = abs_src

            value = str(content_soup)
        except:
            logger.warning(f'修复图片路径异常:`{title}`{link}')

        # Second-pass crawl for WeChat official-account RSS
        if get_host_name(site.rss) in ('qnmlgb.tech', ):
            if get_host_name(link) in ('mp.weixin.qq.com', ):
                rsp = get_with_proxy(link)

                if rsp is not None and rsp.ok:
                    try:
                        title, author, value = parse_weixin_page(rsp)
                    except:
                        pass

        try:
            article = Article(site=site,
                              title=title,
                              author=author,
                              src_url=link,
                              uindex=current_ts(),
                              content=value)
            article.save()

            mark_crawled_url(link)
        except django.db.utils.IntegrityError:
            logger.info(f'数据重复插入:`{title}`{link}')
        except:
            logger.warning(f'数据插入异常:`{title}`{link}')

    set_updated_site(site.pk)
    return True
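
`write_dat2_file` and the matching `get_content` reader (Example #6) are assumed to persist article bodies on disk keyed by `uindex` and site id. A rough sketch; the `DAT2_DIR` setting and the one-file-per-article layout are assumptions:

import os
from django.conf import settings


def _dat2_path(uindex, site_id):
    # Hypothetical layout: one file per article under a per-site directory
    return os.path.join(settings.DAT2_DIR, str(site_id), f'{uindex}.dat2')


def write_dat2_file(uindex, site_id, content):
    # Persist an article body to disk (sketch; the real storage format is not shown)
    path = _dat2_path(uindex, site_id)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding='utf-8') as fp:
        fp.write(content or '')


def get_content(uindex, site_id):
    # Read an article body back, or return None if it was never written (sketch)
    try:
        with open(_dat2_path(uindex, site_id), encoding='utf-8') as fp:
            return fp.read()
    except FileNotFoundError:
        return None
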