def parse(self, response):
    """Parse the category menu page and schedule one request per tag link.

    Extracts every category link from the site menu and yields a
    ``Request`` for each, passing the tag text along in ``meta`` so
    ``tagParse`` can label the items it later produces.
    """
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info('[%s] %s', datetime.date.today(), response.url)
    hxs = scrapy.Selector(response)
    sites = hxs.xpath('//div[@class="menu-wrapper"]/ul/li[@class="m-i "]/a[@class="i-link"]')
    # Loop-invariant: compute the base URL once, not per menu entry.
    base = get_base_url(response)
    for s in sites:
        tags = s.xpath('em/text()').extract()
        urls = s.xpath('@href').extract()
        # Skip malformed menu entries instead of raising IndexError and
        # aborting the parse of the whole page.
        if not tags or not urls:
            continue
        yield Request(
            url=urljoin_rfc(base, urls[0]),
            callback=self.tagParse,
            meta={'tag': tags[0]},
        )
def videoParse(self, response):
    """Parse a single video page and yield a populated ``BilibiliItem``.

    Title, description, tag and picture URL are read from
    ``response.meta`` (set by ``tagParse``); the embed URL is extracted
    from the page itself.
    """
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info('[%s] %s', datetime.date.today(), response.url)
    hxs = scrapy.Selector(response)
    videos = hxs.xpath('//meta[@itemprop="embedURL"]/@content').extract()
    # A page without an embedURL meta tag yields nothing rather than
    # raising IndexError.
    if not videos:
        return
    item = BilibiliItem()
    item['title'] = response.meta['title']
    item['desc'] = response.meta['desc']
    item['tag'] = response.meta['tag']
    item['picurl'] = response.meta['image']
    item['videourl'] = videos[0]
    yield item
def tagParse(self, response):
    """Parse a tag listing page and schedule one ``videoParse`` request
    per video entry.

    Each entry's title, description and preview image are forwarded in
    ``meta`` together with the tag received from ``parse``.
    """
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info('[%s] %s', datetime.date.today(), response.url)
    hxs = scrapy.Selector(response)
    sites = hxs.xpath('//div[@class="b-body"]/ul[@class="vidbox v-list sub"]/li')
    tag = response.meta['tag']
    # Loop-invariant: compute the base URL once, not per list entry.
    base = get_base_url(response)
    for s in sites:
        titles = s.xpath('div/a/@title').extract()
        descs = s.xpath('div/@txt').extract()
        images = s.xpath('div/a/div/img/@src').extract()
        urls = s.xpath('div/a/@href').extract()
        # An entry missing any field is skipped instead of raising
        # IndexError and dropping the rest of the page.
        if not (titles and descs and images and urls):
            continue
        yield Request(
            url=urljoin_rfc(base, urls[0]),
            callback=self.videoParse,
            meta={
                'tag': tag,
                'title': titles[0],
                'desc': descs[0],
                'image': images[0],
            },
        )
def parse(self, response):
    """Parse a Toutiao JSON feed response and yield one item per article.

    The feed category is recovered from the request URL's ``category``
    query parameter, defaulting to ``'__all__'``. Only entries with a
    preview image (``middle_mode``) are emitted.
    """
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info('[%s] %s', datetime.date.today(), response.url)
    jsoncon = json.loads(response.body)
    if jsoncon['message'] != 'success':
        return
    # Parse the query string defensively: tolerate an empty query and
    # parameters without '=' (the original split('=')[1] raised
    # IndexError on both), and keep whole values that contain '='.
    params = dict(
        pair.split('=', 1)
        for pair in urlparse(response.url).query.split('&')
        if '=' in pair
    )
    tag = params.get('category', '__all__')
    for con in jsoncon['data']:
        # middle_mode marks feed entries that carry a preview image.
        if not con['middle_mode']:
            continue
        item = WwwToutiaoComItem()
        item['title'] = con['title'].encode('utf-8')
        item['tag'] = tag
        item['desc'] = con['abstract'].encode('utf-8')
        item['image'] = con['middle_image']
        item['url'] = con['url']
        item['createtime'] = con['create_time']
        yield item
def parse(self, response):
    """Parse a Toutiao JSON feed response and yield one item per article.

    The feed category is recovered from the request URL's ``category``
    query parameter, defaulting to ``'__all__'``. Only entries with a
    preview image (``middle_mode``) are emitted.
    """
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info('[%s] %s', datetime.date.today(), response.url)
    jsoncon = json.loads(response.body)
    if jsoncon['message'] != 'success':
        return
    # Parse the query string defensively: tolerate an empty query and
    # parameters without '=' (the original split('=')[1] raised
    # IndexError on both), and keep whole values that contain '='.
    params = dict(
        pair.split('=', 1)
        for pair in urlparse(response.url).query.split('&')
        if '=' in pair
    )
    tag = params.get('category', '__all__')
    for con in jsoncon['data']:
        # middle_mode marks feed entries that carry a preview image.
        if not con['middle_mode']:
            continue
        item = WwwToutiaoComItem()
        item['title'] = con['title'].encode('utf-8')
        item['tag'] = tag
        item['desc'] = con['abstract'].encode('utf-8')
        item['image'] = con['middle_image']
        item['url'] = con['url']
        item['createtime'] = con['create_time']
        yield item
def parse(self, response):
    """Parse the channel home page and schedule a ``deepParse`` request
    for every article link found in each scroll section.

    Each section is identified by its container ``id``; the section's
    channel label is forwarded in ``meta['cl']`` so ``deepParse`` can
    classify the articles it extracts.
    """
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info('[%s] %s', datetime.date.today(), response.url)
    hxs = HtmlXPathSelector(response)
    # Container id -> channel label for every section we scrape.
    sections = [
        {'id': 'SI_Scroll_2_Cont', 'cl': 'photograph_gallery'},  # visual features
        {'id': 'SI_Scroll_3_Cont', 'cl': 'gossip'},              # celebrity gossip
        {'id': 'SI_Scroll_4_Cont', 'cl': 'style'},               # outfits / styling
        {'id': 'SI_Scroll_5_Cont', 'cl': 'body'},                # fitness / slimming
        {'id': 'SI_Scroll_6_Cont', 'cl': 'beauty'},              # makeup / hair
    ]
    for section in sections:
        links = hxs.select('//div[@id="%s"]/div/div/a/@href' % section['id']).extract()
        for link in links:
            yield Request(
                link,
                callback=self.deepParse,
                meta={'cl': section['cl']},
            )