def parse_detail(self, response):
    """Parse a forum-directory listing page into catalog items.

    The ``fd`` and ``sd`` values are read from the query string of
    ``response.url`` and decoded as GBK. If that fails for any reason the
    traceback is printed and both values fall back to ``None``.

    Yields one loaded ``CatalogItem`` for every ``<td>`` found in a table
    under ``.sub_dir_box`` and, when the pagination bar contains a
    "next page" link, a ``Request`` for that page.
    """
    try:
        query = urlparse.urlparse(response.url).query
        params = urlparse.parse_qs(query)
        fd = params["fd"][0].decode("gbk")
        sd = params["sd"][0].decode("gbk")
    except Exception:
        # Best effort: a missing or undecodable parameter must not stop the
        # page from being parsed. ``Exception`` (not a bare ``except``) so
        # KeyboardInterrupt / SystemExit still propagate.
        fd = None
        sd = None
        traceback.print_exc()

    # The leading "." keeps the XPath relative to the ``.sub_dir_box``
    # selection; a bare "//table//td" would match every cell in the whole
    # document regardless of the preceding css() call.
    cells = response.css(".sub_dir_box").xpath(".//table//td")
    for cell in cells:
        title = cell.xpath("a/text()").extract()
        link = cell.xpath("a/@href").extract()
        il = ConvertItemLoader(CatalogItem())
        il.add_value(u"name", title)
        il.add_value(u"fd", fd)
        il.add_value(u"sd", sd)
        il.add_value(u"url", link)
        il.load_item()
        yield il.item

    next_label = u'\u4e0b\u4e00\u9875'  # link text meaning "next page"
    pager = response.css("#album_list_page").css(".pagination")
    # Inspect each anchor on its own so its text and href stay paired;
    # indexing two separately extracted lists goes wrong as soon as one
    # anchor lacks a direct text node or an href.
    for anchor in pager.xpath("a"):
        if next_label not in anchor.xpath("text()").extract():
            continue
        hrefs = anchor.xpath("@href").extract()
        if hrefs:
            next_url = "http://tieba.baidu.com" + hrefs[0]
            # NOTE(review): the follow-up page is handed to self.parse, not
            # self.parse_detail - confirm this is intended.
            yield Request(next_url, callback=self.parse)
        break
def parse(self, response):
    """Parse a forum thread-list page.

    Steps, in order:

    1. While ``self.need_insert_catalog`` is set, clear it and try to yield
       a ``CatalogItem`` for this forum, taking ``fd``/``sd`` from the query
       string of the last link under ``#forumInfoPanel .forum_dir_info``.
       Any failure is printed and the item is skipped.
    2. For every thread ``<li>`` whose ``data-field`` attribute contains
       "id", decode the JSON and compare its ``reply_num`` with the count
       returned by ``self.crawler.sqlmanager.get_post_replynum``. When the
       stored count is ``None`` or smaller, yield a ``Request`` for the
       thread (callback ``self.parse_post``) carrying a ``PostListItem`` in
       ``meta["pitem"]``. The first non-top thread that is not newer clears
       ``self.refresh_postlist`` and ends the scan.
    3. If ``self.refresh_postlist`` is still set, follow the pager link
       whose text contains ">".

    An error while handling a thread entry is printed, clears
    ``self.refresh_postlist`` and moves on to the next entry.
    """
    if self.need_insert_catalog:
        self.need_insert_catalog = False
        il = ConvertItemLoader(CatalogItem())
        il.add_value(u"name", self.subject)
        il.add_value(u"url", ensure_unicode(response.url))
        try:
            info = response.css("#forumInfoPanel").css(".forum_dir_info").xpath("li/a/@href").extract()[-1]
            params = urlparse.parse_qs(urlparse.urlparse(info).query)
            il.add_value(u"fd", params["fd"][0].encode("raw_unicode_escape").decode("utf8"))
            il.add_value(u"sd", params["sd"][0].encode("raw_unicode_escape").decode("utf8"))
            il.load_item()
        except Exception:
            traceback.print_exc()
        else:
            # Yield outside the try body: a bare ``except`` around a yield
            # would swallow the GeneratorExit raised when the generator is
            # closed.
            yield il.item

    # The leading "." keeps the XPath relative to ``#thread_list``; a bare
    # "//li[...]" would match list items anywhere in the document.
    entries = response.css("#thread_list").xpath('.//li[contains(@data-field, "id")]')
    for entry in entries:
        data = entry.xpath("@data-field").extract()
        request = None
        try:
            data_dict = json.loads(data[0])
            data_dict[u"post_id"] = data_dict.pop("id")
            pid = data_dict["post_id"]
            replynum = data_dict["reply_num"]
            is_top = data_dict["is_top"]
            sql_replynum = self.crawler.sqlmanager.get_post_replynum(pid)
            if sql_replynum is None or replynum > sql_replynum:
                text = entry.css(".threadlist_title").xpath("a/@title").extract()
                data_dict[u"title"] = text[0]
                data_dict[u"subject"] = self.subject
                # Resume at the page holding the first unseen reply.
                # NOTE(review): 30 is presumably the replies-per-page of the
                # thread view - confirm.
                page = 1 if sql_replynum is None else sql_replynum // 30 + 1
                post_url = "http://tieba.baidu.com/p/%d?pn=%d" % (pid, page)
                item = PostListItem()
                for key in item.fields:
                    if key in data_dict:
                        value = data_dict[key]
                        if isinstance(value, unicode):
                            value = value.encode("utf-8")
                        item[key] = value
                request = Request(post_url, self.parse_post)
                request.meta["pitem"] = item
            elif not is_top:
                # A regular thread with nothing new: stop scanning this
                # page and do not follow the pager.
                self.refresh_postlist = False
                break
        except Exception:
            # Single-argument call form prints the same text under the
            # Python 2 print statement and the print function.
            print("-----------parse except------------")
            self.refresh_postlist = False
            traceback.print_exc()
        if request is not None:
            yield request

    if self.refresh_postlist:
        nav = response.css(".pager").xpath("a[contains(text(), '>')]")
        if nav:
            hrefs = nav.xpath("@href").extract()
            if hrefs:
                next_url = "http://tieba.baidu.com" + hrefs[0]
                yield Request(next_url, self.parse)