Пример #1
0
    def parse_member(self, response, follow=True):
        match_obj = re.match(".*?zhiyou.smzdm.com/member/(\d+)/.*?",
                             response.url)
        if match_obj:
            member_id = int(match_obj.group(1))
        item_loader = SmzdmspiderItemLoader(item=MemberItem(),
                                            response=response)
        item_loader.add_value("member_id", member_id)
        item_loader.add_css("member_name", '.info-stuff-nickname a::text')
        item_loader.add_css("info_words", '.info-stuff-words::text')
        item_loader.add_css("yuanchuang", '.yuanchuang a::text')
        item_loader.add_css("wiki", '.wiki a::text')
        item_loader.add_css("baoliao", '.baoliao a::text')
        item_loader.add_css("pingce", '.pingce a::text')
        item_loader.add_css("qingdan", '.qingdan a::text')
        item_loader.add_css("comment", '.comment a::text')
        item_loader.add_css("second", '.second a::text')
        item_loader.add_css("focus", '.user-focus span::text')
        item_loader.add_css("fans", '.user-fans span::text')

        member_item = item_loader.load_item()
        yield member_item
Пример #2
0
    def parse_article(self, response, follow=True):
        # 获取爆料内容
        match_obj = re.match(".*?www.smzdm.com/p/(\d+)/.*?", response.url)
        if match_obj:
            article_id = int(match_obj.group(1))

        item_loader = SmzdmspiderItemLoader(item=SmzdmArticleItem(),
                                            response=response)
        item_loader.add_value("article_id", article_id)
        item_loader.add_css("article_channel", '#article_channel::attr(value)')
        item_loader.add_css("article_title",
                            '.article_title  em[itemprop="name"]::text')
        item_loader.add_value("article_url", response.url)
        if response.css('.ellipsis.author'):
            if response.css('.ellipsis.author > a::text'):
                ellipsis_author = response.css(
                    '.ellipsis.author > a::text').extract_first("None")
                ellipsis_author_id = response.css(
                    '.ellipsis.author > a::attr(href)').extract_first("None")
            else:
                ellipsis_author = "商家自荐"
                ellipsis_author_id = "商家自荐"
        else:
            ellipsis_author = "None"
            ellipsis_author_id = "None"
        item_loader.add_value("ellipsis_author", ellipsis_author)
        item_loader.add_value("ellipsis_author_id", ellipsis_author_id)
        item_loader.add_css("update_time",
                            '.article_meta > span:last-child::text')
        price = response.css('em[itemprop="price"]::text').extract_first("0")
        item_loader.add_value("price", price)
        item_loader.add_css("price_currency",
                            'meta[itemprop="priceCurrency"]::attr(content)')
        item_loader.add_css("price_detail",
                            'em[itemprop="offers"] span.red::text')
        item_loader.add_css("buy_url", '.buy a::attr(href)')
        # item_loader.add_css("content", '.item-preferential')
        item_loader.add_css("fav_num", 'div.leftLayer > a.fav em::text')
        item_loader.add_css("comment_num",
                            'div.leftLayer > a.comment em::text')
        item_loader.add_css("rating_all_num", '#rating_all_num em::text')
        item_loader.add_css("rating_worthy_num", '#rating_worthy_num::text')
        item_loader.add_css("rating_unworthy_num",
                            '#rating_unworthy_num::text')

        article_item = item_loader.load_item()
        yield article_item

        item_loader2 = SmzdmspiderItemLoader(item=SmzdmArticleContentItem(),
                                             response=response)
        item_loader2.add_value("article_id", article_id)
        item_loader2.add_css("content", '.item-preferential')
        article_content = item_loader2.load_item()
        yield article_content

        # tags = response.css('span.tags div::text').extract()
        # tags = [tag.strip() for tag in tags if tag.strip()]
        # tags = ','.join(tags)

        tags = response.css('.meta-tags')
        for tag in tags:
            tag_item = ArticleTagItem()
            tag_url = tag.css('a::attr(href)').extract_first("")
            tag_detail = tag.css('a::text').extract_first("")
            tag_sort = tag.css('div div::text').extract_first("").split(
                u':')[0] if tag.css('div div') else "暂无分类"

            tag_item["article_id"] = article_id
            tag_item["article_url"] = response.url
            tag_item["tag_sort"] = tag_sort
            tag_item["tag_detail"] = tag_detail
            yield tag_item
Пример #3
0
    def parse_comment(self, response, follow=True):
        comments = response.css(
            "div#commentTabBlockNew ul.comment_listBox li.comment_list")
        match_obj = re.match(".*?www.smzdm.com/p/(\d+)/.*?", response.url)
        if match_obj:
            article_id = int(match_obj.group(1))
        for comment in comments:
            grey = comment.css('span::text').extract_first("")
            usmzdmid = comment.css(
                'a.a_underline::attr(usmzdmid)').extract_first("")
            author = comment.css(
                'span[itemprop="author"]::text').extract_first("")
            rank = comment.css('div.rank::attr(title)').extract_first("")
            comment_con = comment.css('div.comment_conWrap')[-1].css(
                'div.comment_con span::text').extract_first("")
            time = comment.css('.time::text').extract_first("")
            come_from = comment.css('.come_from a::text').extract_first(" ")
            dingnum = comment.css(
                'div.comment_action a.dingNum span::text').extract_first("")
            cainum = comment.css(
                'div.comment_action a.caiNum span::text').extract_first("")

            # print grey, usmzdmid, author, rank, dingnum, cainum, comment_con
            item_loader = SmzdmspiderItemLoader(item=CommentItem(),
                                                response=response)
            item_loader.add_value("article_id", article_id)
            item_loader.add_value("article_url", response.url)
            item_loader.add_value("grey", grey)
            item_loader.add_value("usmzdmid", usmzdmid)
            item_loader.add_value("author", author)
            item_loader.add_value("rank", rank)
            item_loader.add_value("comment_time", time)
            item_loader.add_value("comment_con", comment_con)
            item_loader.add_value("come_from", come_from)
            item_loader.add_value("dingnum", dingnum)
            item_loader.add_value("cainum", cainum)

            comment_item = item_loader.load_item()
            yield comment_item