def parse_tags(self, response):
    """Parse a tag page and yield one loaded ``TagItem``.

    Each item field is filled from a single XPath query evaluated
    against ``response``.

    Args:
        response: The downloaded tag page handed to this callback.

    Yields:
        The item produced by ``DefaultItemLoader.load_item()``.
    """
    # NOTE: a previously disabled step pushed response.meta["browser"]
    # back onto self.queue here; it stays disabled.
    field_xpaths = (
        ("name", "//div[@class='top-text']/text()"),
        ("likes", "//div[@class='concern-num']/text()"),
        ("publish_nums", "//div[@class='pageInfo']/span[2]/text()"),
    )

    loader = DefaultItemLoader(item=TagItem(), response=response)
    for field, xpath in field_xpaths:
        loader.add_xpath(field, xpath)

    yield loader.load_item()
def parse_person(self, response):
    """Parse a user profile page and yield one loaded ``PersonItem``.

    Every field is read with a single XPath query. Several values come
    from element attributes rather than text nodes: ``gender`` and
    ``member_level`` from ``@class``, ``level`` from ``@lvl``, ``avatar``
    from ``@src``, and the three counters from ``@title``.

    Args:
        response: The downloaded profile page handed to this callback.

    Yields:
        The item produced by ``DefaultItemLoader.load_item()``.
    """
    # NOTE: a previously disabled step pushed response.meta["browser"]
    # back onto self.queue here; it stays disabled.
    field_xpaths = (
        # Header area of the profile.
        ("name", "//span[@id='h-name']/text()"),
        ("gender", "//span[@id='h-gender']/@class"),
        ("sign", "//*[@class='h-sign']/text()"),
        ("level", "//a[contains(@class, 'h-level')]/@lvl"),
        ("avatar", "//img[@id='h-avatar']/@src"),
        # Labelled text rows.
        ("uid", "//div[contains(@class, 'uid')]/span[@class='text']/text()"),
        ("birthday",
         "//div[contains(@class, 'birthday')]/span[@class='text']/text()"),
        # Counters exposed through the title attribute.
        ("attention_nums", "//a[contains(@class, 'n-gz')]/@title"),
        ("fans_nums", "//a[contains(@class, 'n-fs')]/@title"),
        ("play_nums", "//div[contains(@class, 'n-bf')]/@title"),
        ("register_time",
         "//div[contains(@class, 'regtime')]/span[@class='text']/text()"),
        ("member_level", "//a[contains(@class, 'h-vipType')]/@class"),
        # Multi-valued fields.
        ("play_game_list",
         "//div[contains(@class, 'game')]//div[@class='detail']/text()"),
        ("tags", "//div[contains(@class, 'tag-list')]/a/text()"),
    )

    loader = DefaultItemLoader(PersonItem(), response=response)
    for field, xpath in field_xpaths:
        loader.add_xpath(field, xpath)

    yield loader.load_item()
def parse_article(self, response):
    """Parse an article page and yield one loaded ``ArticleItem``.

    Most fields are XPath queries against ``response``. ``url`` is the
    response URL itself and ``cid`` is ``get_id(response.url)``. When the
    module-level ``PARSE_COMMENTS`` flag is truthy, every item produced by
    ``self.gen_comments_item(response)`` is yielded after the article.

    Args:
        response: The downloaded article page handed to this callback.

    Yields:
        The loaded article item, optionally followed by comment items.
    """
    header_xpaths = (
        ("author", "//a[@class='up-name']/text()"),
        ("cover_img_url", "//div[@class='banner-img-holder']/@style"),
        ("title", "//h1[@class='title']/text()"),
        ("desc", "//div[contains(@class, 'article-holder')]/p/text()"),
    )
    body_xpaths = (
        ("img_box", "//figure[@class='img-box']/img/@data-src"),
        # The three counters are sibling spans, addressed by position.
        ("views", "//div[@class='article-data']/span[1]/text()"),
        ("likes", "//div[@class='article-data']/span[2]/text()"),
        ("comments", "//div[@class='article-data']/span[3]/text()"),
        ("coins", "//div[@class='coin-btn']/div/span/text()"),
        ("collections", "//div[@class='fav-btn']/div/span/text()"),
        ("shares", "//div[@class='share-btn']/div/span/text()"),
        ("publish_time", "//span[@class='create-time']/text()"),
        ("category", "//a[@class='category-link']/span/text()"),
        ("tags", "//li[@class='tag-item']/span[2]/text()"),
    )

    loader = DefaultItemLoader(ArticleItem(), response=response)

    for field, xpath in header_xpaths:
        loader.add_xpath(field, xpath)

    # Values taken from the URL rather than from the page markup.
    loader.add_value("url", response.url)
    loader.add_value("cid", get_id(response.url))

    for field, xpath in body_xpaths:
        loader.add_xpath(field, xpath)

    yield loader.load_item()

    if not PARSE_COMMENTS:
        return
    for comment_item in self.gen_comments_item(response):
        yield comment_item
def parse_detail(self, response):
    """Parse a video page and yield one loaded ``VideoItem``.

    Fields come from three places: XPath queries against ``response``,
    the response URL (``url`` directly, ``vid`` via ``get_id``), and
    ``response.meta`` (``comments`` and ``likes``, each defaulting to 0
    when absent). The ``detail_item`` crawler stat is incremented once per
    loaded item, before the item is yielded. When the module-level
    ``PARSE_COMMENTS`` flag is truthy, every item produced by
    ``self.gen_comments_item(response)`` is yielded afterwards.

    Args:
        response: The downloaded video page handed to this callback.

    Yields:
        The loaded video item, optionally followed by comment items.
    """
    heading_xpaths = (
        ("author",
         "//div[contains(@class, 'user')]/a[contains(@class, 'name')]/text()"),
        ("title", "//h1/@title"),
        # No text() step: this selects the description element itself.
        ("desc", "//div[@id='v_desc']/div[contains(@class, 'info')]"),
    )
    # XPath substring(s, n) keeps everything from the n-th character on,
    # dropping the leading characters of each title attribute
    # (presumably a label prefix -- confirm against the live page).
    counter_xpaths = (
        ("play_nums", "substring(//span[contains(@class, 'v play')]/@title, 5)"),
        ("danmu_nums", "substring(//span[contains(@class, 'v dm')]/@title, 5)"),
        ("coins", "substring(//span[@report-id='coinbtn1']/@title, 6)"),
        ("collections", "substring(//span[@report-id='collect1']/@title, 5)"),
    )
    footer_xpaths = (
        ("publish_time", "//time/text()"),
        ("category", "//div[contains(@class, 'tminfo')]/span/a/text()"),
        ("tags", "//ul[contains(@class, 'tag-area')]/li/a/text()"),
    )

    loader = DefaultItemLoader(VideoItem(), response=response)

    loader.add_value("vid", get_id(response.url))
    for field, xpath in heading_xpaths:
        loader.add_xpath(field, xpath)
    loader.add_value("url", response.url)

    for field, xpath in counter_xpaths:
        loader.add_xpath(field, xpath)

    # comments/likes are supplied through the request meta, not the page.
    loader.add_value("comments", response.meta.get("comments", 0))
    loader.add_xpath(
        "shares", "//div[@id='playpage_share']//span[@class='num']/text()")
    loader.add_value("likes", response.meta.get("likes", 0))

    for field, xpath in footer_xpaths:
        loader.add_xpath(field, xpath)

    video_item = loader.load_item()
    self.crawler.stats.inc_value("detail_item")
    yield video_item

    if not PARSE_COMMENTS:
        return
    for comment_item in self.gen_comments_item(response):
        yield comment_item