Пример #1
0
    def parse_detail(self, response):
        """Parse a catalog page into ``CatalogItem`` objects.

        Yields one item per ``td`` cell found in tables under
        ``.sub_dir_box``. Each item carries the ``fd`` and ``sd`` values read
        from the query string of ``response.url`` (both ``None`` when either
        one cannot be read). Afterwards yields a ``Request`` for the next page
        when the pagination bar contains a link labelled
        ``u'\\u4e0b\\u4e00\\u9875'``.
        """
        try:
            query = urlparse.urlparse(response.url).query
            params = urlparse.parse_qs(query)
            # The query values are decoded as GBK.
            fd = params["fd"][0].decode("gbk")
            sd = params["sd"][0].decode("gbk")
        except Exception:
            # Best effort: still emit the items, just without fd/sd.
            fd = None
            sd = None
            traceback.print_exc()

        # ".//" keeps the lookup inside .sub_dir_box; a leading "//" would
        # search the whole document regardless of the preceding css() call.
        cells = response.css(".sub_dir_box").xpath(".//table//td")
        for cell in cells:
            il = ConvertItemLoader(CatalogItem())
            il.add_value(u"name", cell.xpath("a/text()").extract())
            il.add_value(u"fd", fd)
            il.add_value(u"sd", sd)
            il.add_value(u"url", cell.xpath("a/@href").extract())
            il.load_item()
            yield il.item

        # Inspect each pagination anchor on its own so that the label and the
        # href always belong to the same element (parallel text/href lists go
        # out of step when an anchor has no text node or several of them).
        next_label = u'\u4e0b\u4e00\u9875'
        anchors = response.css("#album_list_page").css(".pagination").xpath("a")
        for anchor in anchors:
            hrefs = anchor.xpath("@href").extract()
            if hrefs and next_label in anchor.xpath("text()").extract():
                # NOTE(review): the follow-up page is dispatched to self.parse,
                # not parse_detail -- confirm this is intended.
                yield Request("http://tieba.baidu.com" + hrefs[0],
                              callback=self.parse)
                break
Пример #2
0
    def parse(self, response):
        """Parse a thread-list page.

        * While ``self.need_insert_catalog`` is set, clears the flag and
          yields one ``CatalogItem`` describing the forum (name, url, fd, sd).
          Nothing is yielded if the fd/sd link cannot be read.
        * For every ``li`` under ``#thread_list`` whose ``data-field``
          attribute contains ``"id"``: when
          ``self.crawler.sqlmanager.get_post_replynum`` returns ``None`` or a
          smaller reply count than the page reports, yields a ``Request``
          handled by ``self.parse_post`` with a ``PostListItem`` stored in
          ``request.meta["pitem"]``. The first non-top entry that has no new
          replies clears ``self.refresh_postlist`` and stops the scan.
        * While ``self.refresh_postlist`` is still true, follows the pager
          link whose text contains ``>``.
        """
        if self.need_insert_catalog:
            self.need_insert_catalog = False
            il = ConvertItemLoader(CatalogItem())
            il.add_value(u"name", self.subject)
            il.add_value(u"url", ensure_unicode(response.url))
            try:
                info = response.css("#forumInfoPanel").css(".forum_dir_info").xpath("li/a/@href").extract()[-1]
                params = urlparse.parse_qs(urlparse.urlparse(info).query)
                il.add_value(u"fd", params["fd"][0].encode("raw_unicode_escape").decode("utf8"))
                il.add_value(u"sd", params["sd"][0].encode("raw_unicode_escape").decode("utf8"))
                il.load_item()
            except Exception:
                traceback.print_exc()
            else:
                # Yield outside the try body: a bare except around a yield
                # would also swallow GeneratorExit when the generator is
                # closed at this point.
                yield il.item

        # ".//" keeps the lookup inside #thread_list; a leading "//" would
        # search the whole document regardless of the preceding css() call.
        threads = response.css("#thread_list").xpath('.//li[contains(@data-field, "id")]')
        for li in threads:
            data = li.xpath("@data-field").extract()
            try:
                data_dict = json.loads(data[0])
                data_dict[u"post_id"] = data_dict.pop("id")
                pid = data_dict["post_id"]
                replynum = data_dict["reply_num"]
                is_top = data_dict["is_top"]

                sql_replynum = self.crawler.sqlmanager.get_post_replynum(pid)
                if sql_replynum is None or replynum > sql_replynum:
                    text = li.css(".threadlist_title").xpath("a/@title").extract()
                    data_dict[u"title"] = text[0]
                    data_dict[u"subject"] = self.subject
                    # Start at page 1 when get_post_replynum returned None,
                    # otherwise at the page derived from the stored reply
                    # count at 30 per page.
                    page = 1 if sql_replynum is None else sql_replynum // 30 + 1
                    post_url = "http://tieba.baidu.com/p/%d?pn=%d" % (pid, page)

                    item = PostListItem()
                    for key in item.fields:
                        if key in data_dict:
                            value = data_dict[key]
                            # Unicode values are stored as UTF-8 byte strings.
                            if isinstance(value, unicode):
                                value = value.encode("utf-8")
                            item[key] = value
                    request = Request(post_url, self.parse_post)
                    request.meta["pitem"] = item
                    yield request
                elif not is_top:
                    # A non-top entry without new replies ends the scan and
                    # disables pagination.
                    self.refresh_postlist = False
                    break
            except Exception:
                # "except Exception" (not a bare except) lets GeneratorExit
                # propagate when the generator is closed at the yield above.
                print("-----------parse except------------")
                self.refresh_postlist = False
                traceback.print_exc()

        if self.refresh_postlist:
            nav = response.css(".pager").xpath("a[contains(text(), '>')]")
            hrefs = nav.xpath("@href").extract()
            if hrefs:
                yield Request("http://tieba.baidu.com" + hrefs[0], self.parse)