示例#1
0
class MyItem(Item):
    """
    Define the target fields of the crawler.
    """
    # Selected with the CSS selector `html`.
    # NOTE(review): `target_item` presumably has a special role in the Item
    # base class (not visible here) — confirm.
    target_item = HtmlField(css_select='html')
    # Selected with the CSS selector `head title`.
    title = TextField(css_select='head title')
    # Selected with the CSS selector `article`.
    article = HtmlField(css_select='article')
示例#2
0
def test_html_field_with_many():
    """With ``many=True`` the field yields one HTML string per ``a.test_link`` match."""
    link_field = HtmlField(css_select="a.test_link", many=True)
    extracted = link_field.extract(html_etree=html_etree)

    expected_first = '<a class="test_link" href="https://github.com/howie6879/">hello1 github.</a>\n'
    # The last match also carries the text that trails the anchor element.
    expected_last = (
        '<a class="test_link" href="https://github.com/howie6879/">hello5 github.</a>\n'
        '    Some text outside.\n'
    )

    assert len(extracted) == 5
    assert extracted[0] == expected_first
    assert extracted[4] == expected_last
示例#3
0
def test_html_field():
    """A single-match HtmlField returns the matched element's HTML as a string."""
    anchor_field = HtmlField(css_select="div.brand a")
    paragraph_field = HtmlField(css_select="div.brand p")

    anchor_html = anchor_field.extract(html_etree=html_etree)
    assert anchor_html == '<a href="https://github.com">Github</a>'

    # Non-ASCII content is expected to come back unescaped.
    paragraph_html = paragraph_field.extract(html_etree=html_etree)
    assert paragraph_html == "<p>你好</p>\n"
示例#4
0
class RankingItem(Item):
    """Item for one ranking block: its title, its "more" link and its book entries."""

    target_item = TextField(css_select='.rank-list')
    ranking_title = TextField(css_select='h3.wrap-title')
    more = AttrField(css_select='h3>a.more', attr='href')
    book_list = HtmlField(css_select='div.book-list>ul>li', many=True)

    async def clean_ranking_title(self, ranking_title):
        """Normalise the ranking title.

        A list input yields the ``text`` attribute of its first element;
        anything else is stringified and cut right after the first '榜'
        (the character is appended even when it was not present).
        """
        if isinstance(ranking_title, list):
            first_entry = ranking_title[0]
            return first_entry.text

        head, _sep, _tail = str(ranking_title).partition('榜')
        return head + '榜'

    async def clean_more(self, more):
        """Prefix the extracted href with the ``https:`` scheme."""
        scheme = "https:"
        return scheme + more
示例#5
0
class ChinaNewsItem(Item):
    """
    Define the extraction rules for the target fields.
    """
    #target_item =
    title = TextField(css_select='h1')
    content = HtmlField(css_select='div.left_zw')

    async def clean_title(self, value):
        """
        Clean the extracted target data.
        :param value: the initial extracted value
        :return: the value, currently returned unchanged
        """
        return value
示例#6
0
def test_html_field():
    """HtmlField returns the HTML of the first ``div.brand a`` match."""
    brand_link_field = HtmlField(css_select="div.brand a")
    extracted_html = brand_link_field.extract(html_etree=html_etree)
    expected_html = '<a href="https://github.com">Github</a>'
    assert extracted_html == expected_html
示例#7
0
class RankingItem(Item):
    """Item for one ranking block; every field is located with a CSS selector."""
    # NOTE(review): `target_item` presumably has a special role in the Item
    # base class (not visible here) — confirm.
    target_item = TextField(css_select='div.rank_i_p_list')
    # Selected with `div.rank_i_p_tit`.
    ranking_title = TextField(css_select='div.rank_i_p_tit')
    # Reads the `href` attribute of the link inside `div.rank_i_more`.
    more = AttrField(css_select='div.rank_i_more a', attr='href')
    # `many=True`: presumably yields a list with one entry per
    # `div.rank_i_li` child — confirm against the field implementation.
    book_list = HtmlField(css_select='div.rank_i_p_list>div.rank_i_li',
                          many=True)
示例#8
0
class WechatItem(Item):
    """
    Ruia-based Item that extracts fields from a WeChat article page.

    Example page: https://mp.weixin.qq.com/s/NKnTiLixjB9h8fSd7Gq8lw
    """

    # Article title
    doc_name = AttrField(css_select='meta[property="og:title"]',
                         attr="content",
                         default="")
    # Description
    doc_des = AttrField(css_select='meta[property="og:description"]',
                        attr="content",
                        default="")
    # Article author
    doc_author = AttrField(css_select='meta[property="og:article:author"]',
                           attr="content",
                           default="")
    # Article link. The link carries an expiry time, but it does not expire
    # when opened inside WeChat, so it is still usable.
    doc_link = AttrField(css_select='meta[property="og:url"]',
                         attr="content",
                         default="")
    # Article type
    doc_type = AttrField(css_select='meta[property="og:type"]',
                         attr="content",
                         default="")
    # Article publish timestamp.
    # The default is an empty placeholder rather than ``time.time()``: a
    # class-level ``time.time()`` is evaluated once, when the class is defined,
    # so every page lacking ``ct`` would be stamped with the process start
    # time. ``clean_doc_ts`` substitutes the current time for non-numeric input.
    doc_ts = RegexField(
        re_select=r"var ct = \"(\d{1,10})\"\;",
        default="",
    )
    # Article publish date.
    # Same reasoning as ``doc_ts``: the fallback date is computed per item in
    # ``clean_doc_date`` instead of being frozen at class-definition time.
    doc_date = RegexField(
        re_select=r"var ct = \"(\d{1,10})\"\;",
        default="",
    )
    # doc_date_f1 = TextField(css_select="em#publish_time", default="")
    # doc_date_f2 = RegexField(
    #     re_select=r"o=\"(20\d.*)\"\;",
    #     default=ts_to_str_date(time.time(), "%Y-%m-%d %H:%M"),
    # )
    # doc_ts_f1 = TextField(css_select="em#publish_time", default="")
    # doc_ts_f2 = RegexField(
    #     re_select=r"o=\"(20\d.*)\"\;",
    #     default=ts_to_str_date(time.time(), "%Y-%m-%d %H:%M"),
    # )
    # Article image
    doc_image = AttrField(css_select='meta[property="og:image"]',
                          attr="content",
                          default="")
    # Official-account name
    doc_source_name = TextField(
        css_select="div.profile_inner>strong.profile_nickname", default="")
    # Official-account metadata
    doc_source_meta_list = TextField(
        css_select="p.profile_meta>span.profile_meta_value",
        many=True,
        default=["", ""])
    # Core HTML
    doc_core_html = HtmlField(css_select="div#js_content", default="")
    # Official-account nickname (filled in by clean_doc_source_meta_list)
    doc_source_account_nick = ""
    # Official-account introduction (filled in by clean_doc_source_meta_list)
    doc_source_account_intro = ""
    # Plain-text content, kept for compatibility
    doc_content = ""
    # Constants
    # Information source
    doc_source = "liuli_wechat"

    async def clean_doc_source_meta_list(self, value: list):
        """Extract the account nickname and introduction from doc_source_meta_list.

        :param value: extracted metadata entries; the first is used as the
            nickname and the second as the introduction
        :return: ``value`` unchanged

        Missing entries are treated as empty strings, so a page exposing fewer
        than two metadata spans no longer raises ``IndexError``.
        """
        # Pad a copy; the input (possibly the shared default list) is not mutated.
        nick, intro = (list(value or []) + ["", ""])[:2]
        self.doc_source_account_nick = nick
        self.doc_source_account_intro = intro
        return value

    async def clean_doc_core_html(self, value: str):
        """Clean the core HTML.

        Strips surrounding whitespace, removes ``visibility: visible;`` and
        ``<br>``, rewrites ``data-src`` to ``src``, then passes the result
        through ``text_compress``.
        """
        cleaned = (
            str(value)
            .strip()
            .replace("visibility: visible;", "")
            .replace("<br>", "")
            .replace("data-src", "src")
        )
        return text_compress(cleaned)

    async def clean_doc_date(self, value):
        """
        Clean the publish date; output format is e.g. 2021-12-17 08:48
        (per the original note — confirm against ``ts_to_str_date``).

        :param value: the extracted timestamp (digit string or number)
        :return: the formatted date, or the formatted current time when
            ``value`` cannot be converted
        """
        try:
            # Convert explicitly so the digit string captured by the regex
            # reaches the helper as a number, like the fallback below.
            value = ts_to_str_date(int(value))
        except Exception:
            # Deliberate best effort: any failure falls back to "now".
            value = ts_to_str_date(time.time())
        return value

    async def clean_doc_ts(self, value):
        """
        Clean the publish timestamp; output format is e.g. 1620567960.

        :param value: the extracted timestamp (digit string or number)
        :return: the timestamp as ``int``, or the current time when ``value``
            is not numeric
        """
        try:
            value = int(value)
        except (TypeError, ValueError, OverflowError):
            value = int(time.time())
        return value
示例#9
0
class HackerNewsItem(Item):
    """Item for one story row; every field is located with a CSS selector."""
    # NOTE(review): `target_item` presumably has a special role in the Item
    # base class (not visible here) — confirm.
    target_item = TextField(css_select='tr.athing')
    # The three fields below all read from the same `a.storylink` element.
    title = TextField(css_select='a.storylink')
    # Reads the `href` attribute of the link.
    url = AttrField(css_select='a.storylink', attr='href')
    content = HtmlField(css_select='a.storylink')