예제 #1
0
파일: parsers.py 프로젝트: playbar/spider
    def __init__(self, response):
        """Initialise the parser and fill ``self.item`` from ``response``.

        The parent initialiser runs first, then a fresh ``AlbumItem`` is
        created and ``get_from_url()`` is called.  The remaining extraction
        steps only run while ``next_page`` is still false afterwards.
        """
        ParentParser.__init__(self, response)
        self.item = AlbumItem()
        self.next_page = False

        self.get_from_url()
        if not self.next_page:
            # Fields read straight from the page header / footer.
            self.get_album_name()
            self.get_author()
            self.get_recommend_total()
            self.get_like_total()
            self.get_tags()

            # NOTE(review): presumably the three getters below rely on state
            # prepared by parse_short_info() -- keep this call order.
            self.parse_short_info()
            self.get_create_date()
            self.get_photo_count()
            self.get_follow_count()
            self.get_desc()
예제 #2
0
    def __init__(self, response):
        """Set up parser state and run every extraction step in sequence.

        ``next_page`` starts out false; if it is true once
        ``get_from_url()`` has run, construction stops there and no further
        field is extracted.
        """
        ParentParser.__init__(self, response)
        self.next_page = False
        self.item = AlbumItem()

        # The URL step goes first: it decides whether anything else runs.
        self.get_from_url()
        if self.next_page:
            return

        # Album identity and popularity counters.
        self.get_album_name()
        self.get_author()
        self.get_recommend_total()
        self.get_like_total()
        self.get_tags()

        # Short-info block, followed by the getters that come after it.
        self.parse_short_info()
        self.get_create_date()
        self.get_photo_count()
        self.get_follow_count()
        self.get_desc()
예제 #3
0
class AlbumParser(ParentParser):
    """Extract album metadata from a response into an ``AlbumItem``.

    Each ``get_*`` method fills at most one field of ``self.item`` (or one
    or two keys of the nested ``author`` dict) and leaves it unset when the
    corresponding markup is missing from the page.
    """

    def __init__(self, response):
        """Populate ``self.item`` from ``response``.

        When the response URL contains a ``?``, ``next_page`` is set and only
        ``from_url`` is recorded; all other extraction steps are skipped.
        """
        ParentParser.__init__(self, response)
        self.next_page = False
        self.item = AlbumItem()
        # Defined up front so the short-info getters never hit a missing
        # attribute when parse_short_info() has not run.
        self.short_info = ""

        self.get_from_url()
        if self.next_page:
            return

        self.get_album_name()
        self.get_author()
        self.get_recommend_total()
        self.get_like_total()
        self.get_tags()

        # parse_short_info() must precede the getters that search short_info.
        self.parse_short_info()
        self.get_create_date()
        self.get_photo_count()
        self.get_follow_count()
        self.get_desc()

    def get_from_url(self):
        """Store the URL without its query string and flag URLs that had one."""
        base_url, separator, _query = self.response.url.partition("?")
        if separator:
            self.next_page = True
        self.item["from_url"] = base_url

    def get_album_name(self):
        """Split the first ``<h1>`` text at its first ``-``.

        The part before the dash becomes ``author["nickname"]`` (with the
        literal suffix "的相册", i.e. "'s album", removed) and the part after
        it becomes ``album_name``.  Nothing is set when the page has no
        ``<h1>`` text or the heading contains no dash.
        """
        headings = self.response.xpath("//h1/text()").extract()
        if not headings:
            # Missing heading: skip the field, as the other getters do.
            return
        owner, separator, album_name = headings[0].partition("-")
        if not separator:
            return
        self.item["album_name"] = album_name
        author = self.item.setdefault("author", {})
        author["nickname"] = owner.replace(u"的相册", "")

    def get_author(self):
        """Record the profile link and avatar URL of the album owner."""
        x_author = self.response.xpath(
            "//div[@id='db-usr-profile']/div[@class='pic']/a")
        if not x_author:
            return
        author = self.item.setdefault("author", {})
        # Guard each attribute separately so a link without an href or an
        # image does not raise IndexError.
        home_page = x_author.xpath("@href").extract()
        if home_page:
            author["home_page"] = home_page[0]
        avatar = x_author.xpath("img/@src").extract()
        if avatar:
            author["avatar"] = avatar[0]

    def get_recommend_total(self):
        """Store the first integer found in the ``rec-num`` span."""
        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        x_recommend_total = self.response.xpath(
            "//span[@class='rec-num']").re(r"\d+")
        if x_recommend_total:
            self.item["recommend_total"] = int(x_recommend_total[0])

    def get_like_total(self):
        """Store the first integer found in the ``fav-num`` link text."""
        x_like_total = self.response.xpath(
            "//span[@class='fav-num']/a/text()").re(r"\d+")
        if x_like_total:
            self.item["like_total"] = int(x_like_total[0])

    def get_tags(self):
        """Store the list of footer tag texts, if there are any."""
        x_tags = self.response.xpath(
            "//div[@class='footer-tags']/a/text()").extract()
        if x_tags:
            self.item["tags"] = x_tags

    def parse_short_info(self):
        """Join all text nodes under ``div.wr`` into ``self.short_info``."""
        self.short_info = "".join(
            self.response.xpath("//div[@class='wr']//text()").extract())

    def get_create_date(self):
        """Store the whole ``CREATE_DATE_RE`` match found in ``short_info``."""
        match = CREATE_DATE_RE.search(self.short_info)
        if match is not None:
            self.item["create_date"] = match.group(0)

    def get_photo_count(self):
        """Store group 1 of ``PHOTO_COUNT_RE`` as an integer."""
        match = PHOTO_COUNT_RE.search(self.short_info)
        if match is not None:
            self.item["photo_count"] = int(match.group(1))

    def get_follow_count(self):
        """Store group 1 of ``FOLLOW_RE`` as an integer."""
        match = FOLLOW_RE.search(self.short_info)
        if match is not None:
            self.item["follow_count"] = int(match.group(1))

    def get_desc(self):
        """Store the first direct text node of ``div#link-report``."""
        x_desc = self.response.xpath("//div[@id='link-report']/text()").extract()
        if x_desc:
            self.item["desc"] = x_desc[0]
예제 #4
0
파일: parsers.py 프로젝트: playbar/spider
class AlbumParser(ParentParser):
    """Parse an album page response into an ``AlbumItem``.

    The ``get_*`` methods each write one field of ``self.item`` (or keys of
    its nested ``author`` dict) and skip the field when the markup they look
    for is absent.
    """

    def __init__(self, response):
        """Run all extraction steps against ``response``.

        A response URL containing ``?`` sets ``next_page``; in that case only
        ``from_url`` is stored and the other steps do not run.
        """
        ParentParser.__init__(self, response)
        self.next_page = False
        self.item = AlbumItem()
        # Initialised here so get_create_date() and friends do not raise
        # AttributeError if called before parse_short_info().
        self.short_info = ""

        self.get_from_url()
        if self.next_page:
            return

        self.get_album_name()
        self.get_author()
        self.get_recommend_total()
        self.get_like_total()
        self.get_tags()

        # The next three getters search the text built by parse_short_info().
        self.parse_short_info()
        self.get_create_date()
        self.get_photo_count()
        self.get_follow_count()
        self.get_desc()

    def get_from_url(self):
        """Save the URL up to the first ``?`` and mark URLs that contain one."""
        base_url, separator, _query = self.response.url.partition("?")
        if separator:
            self.next_page = True
        self.item["from_url"] = base_url

    def get_album_name(self):
        """Derive ``album_name`` and ``author["nickname"]`` from the ``<h1>``.

        The first heading text is split at its first ``-``: the left part,
        minus the literal suffix "的相册" ("'s album"), is the nickname and
        the right part is the album name.  Without a heading or without a
        dash, nothing is stored.
        """
        headings = self.response.xpath("//h1/text()").extract()
        if not headings:
            # No heading text: leave the fields unset instead of raising.
            return
        owner, separator, album_name = headings[0].partition("-")
        if not separator:
            return
        self.item["album_name"] = album_name
        author = self.item.setdefault("author", {})
        author["nickname"] = owner.replace(u"的相册", "")

    def get_author(self):
        """Store the owner's profile URL and avatar URL when present."""
        x_author = self.response.xpath(
            "//div[@id='db-usr-profile']/div[@class='pic']/a")
        if not x_author:
            return
        author = self.item.setdefault("author", {})
        # Each attribute is checked on its own so a missing href or image
        # cannot raise IndexError.
        home_page = x_author.xpath("@href").extract()
        if home_page:
            author["home_page"] = home_page[0]
        avatar = x_author.xpath("img/@src").extract()
        if avatar:
            author["avatar"] = avatar[0]

    def get_recommend_total(self):
        """Store the first run of digits in the ``rec-num`` span as an int."""
        # Raw string avoids the invalid "\d" escape in a normal literal.
        x_recommend_total = self.response.xpath(
            "//span[@class='rec-num']").re(r"\d+")
        if x_recommend_total:
            self.item["recommend_total"] = int(x_recommend_total[0])

    def get_like_total(self):
        """Store the first run of digits in the ``fav-num`` link as an int."""
        x_like_total = self.response.xpath(
            "//span[@class='fav-num']/a/text()").re(r"\d+")
        if x_like_total:
            self.item["like_total"] = int(x_like_total[0])

    def get_tags(self):
        """Store the texts of the footer tag links when at least one exists."""
        x_tags = self.response.xpath(
            "//div[@class='footer-tags']/a/text()").extract()
        if x_tags:
            self.item["tags"] = x_tags

    def parse_short_info(self):
        """Concatenate every text node under ``div.wr`` into ``short_info``."""
        self.short_info = "".join(
            self.response.xpath("//div[@class='wr']//text()").extract())

    def get_create_date(self):
        """Store the full ``CREATE_DATE_RE`` match from ``short_info``."""
        match = CREATE_DATE_RE.search(self.short_info)
        if match is not None:
            self.item["create_date"] = match.group(0)

    def get_photo_count(self):
        """Store group 1 of the ``PHOTO_COUNT_RE`` match as an int."""
        match = PHOTO_COUNT_RE.search(self.short_info)
        if match is not None:
            self.item["photo_count"] = int(match.group(1))

    def get_follow_count(self):
        """Store group 1 of the ``FOLLOW_RE`` match as an int."""
        match = FOLLOW_RE.search(self.short_info)
        if match is not None:
            self.item["follow_count"] = int(match.group(1))

    def get_desc(self):
        """Store the first direct text node of ``div#link-report``."""
        x_desc = self.response.xpath(
            "//div[@id='link-report']/text()").extract()
        if x_desc:
            self.item["desc"] = x_desc[0]