def parse_item(self, response):
    """Parse an app detail page into an AppstoreItem."""
    page = Selector(response)
    item = AppstoreItem()
    item['title'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()'
    ).extract_first().encode('utf-8')
    item['url'] = response.url
    appid = re.match(r'http://.*/(.*)', item['url']).group(1)
    item['appid'] = appid
    item['intro'] = page.xpath(
        '//meta[@name="description"]/@content').extract_first().encode('utf-8')
    # Each recommended app sits in its own "open-info" block.
    divs = page.xpath('//div[@class="open-info"]')
    recomm = ""
    for div in divs:
        url = div.xpath('./p[@class="name"]/a/@href').extract_first()
        recommended_appid = re.match(r'http://.*/(.*)', url).group(1)
        name = div.xpath(
            './p[@class="name"]/a/text()').extract_first().encode('utf-8')
        recomm += "{0}:{1},".format(recommended_appid, name)
    item['recommended'] = recomm
    yield item

def parse_item(self, response):
    page = Selector(response)
    item = AppstoreItem()
    item['title'] = page.xpath(
        './/ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()'
    ).extract_first().encode('utf-8')
    item['url'] = response.url
    item['appid'] = re.match(r'http://.*/(.*)', item['url']).group(1)
    item['intro'] = page.xpath(
        '//meta[@name="description"]/@content').extract_first().encode('utf-8')
    # The icon URL is carried in the lazyload attribute rather than src.
    item['thumbnailurl'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li[@class="img"]/img[@class="app-ico"]/@lazyload'
    ).extract_first()
    divs = page.xpath('//div[@class="open-info"]')
    recomm = ""
    for div in divs:
        url = div.xpath('./p[@class="name"]/a/@href').extract_first()
        recommended_appid = re.match(r'http://.*/(.*)', url).group(1)
        name = div.xpath(
            './p[@class="name"]/a/text()').extract_first().encode('utf-8')
        recomm += "{0}:{1},".format(recommended_appid, name)
    item['recommended'] = recomm
    yield item

def parse_item(self, response):
    page = Selector(response)
    item = AppstoreItem()
    item['title'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()'
    ).extract_first().encode('utf-8')
    item['url'] = response.url
    print item['url']
    # The app id is exposed in a hidden input rather than in the URL.
    item['appid'] = page.xpath(
        '//input[@id="appId"]/@value').extract_first().encode('utf-8')
    item['intro'] = page.xpath(
        '//meta[@name="description"]/@content').extract_first().encode('utf-8')
    divs = page.xpath('//div[@class="open-info"]')
    recomm = ""
    # len(divs) == 20: the first 10 are recommended apps, the last 10 are the
    # same-category top 10, so only the first half is collected here.
    for div in divs[:10]:
        url = div.xpath('./p[@class="name"]/a/@href').extract_first()
        recom_appid = re.search(r'C\d*', url).group()
        name = div.xpath(
            './p[@class="name"]/a/@title').extract_first().encode('utf-8')
        recomm += "{0}:{1},".format(recom_appid, name)
    item['recommended'] = recomm
    yield item

def parse(self, response):
    """
    response.body is the result of a render.html call: HTML that has already
    been processed by a browser. Parse that HTML here.

    :param response:
    :return: a request for each detail page, plus a request for the next
        page if one exists
    """
    page = Selector(response)
    divs = page.xpath('//div[@class="list-game-app dotline-btn nofloat"]')
    current_url = response.url
    # Parse details; only the first two entries are followed here.
    count = 0
    for div in divs:
        if count >= 2:
            break
        item = AppstoreItem()
        info = div.xpath('.//div[@class="game-info whole"]')
        detail_url = info.xpath(
            './h4[@class="title"]/a/@href').extract_first()
        item["url"] = detail_url
        req = Request(detail_url, callback=self.parse_detail_page)
        req.meta["item"] = item
        count += 1
        yield req
    # next page

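# A minimal sketch, not taken from the original project, of how the request
# feeding the parse() method above could be produced: the page is fetched
# through a Splash render.html endpoint so that response.body holds
# browser-rendered HTML. The Splash address and the start URL below are
# assumptions, and parse() is assumed to be a method of this spider.
from urllib import urlencode  # Python 2, matching the snippets above

from scrapy import Request, Spider


class RenderedListSpider(Spider):
    name = 'rendered_list'
    splash_endpoint = 'http://localhost:8050/render.html'  # assumed Splash host

    def start_requests(self):
        target = 'http://app.example.com/list'  # hypothetical list-page URL
        # Ask Splash to load the page, run its JavaScript, and return the
        # final HTML, which then reaches parse() as response.body.
        query = urlencode({'url': target, 'wait': 0.5})
        yield Request('{0}?{1}'.format(self.splash_endpoint, query),
                      callback=self.parse)
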
def parse(self, response):
    page = Selector(response)
    divs = page.xpath('//ul[@class="applist"]/li')
    for div in divs:
        item = AppstoreItem()
        item['title'] = div.xpath(
            './h5/a/text()').extract_first().encode('utf-8')
        item['url'] = div.xpath('./h5/a/@href').extract_first()
        appid = re.match(r'/detail/(.*)', item['url']).group(1)
        item['appid'] = appid
        item['intro'] = div.xpath(
            './/p[@class="app-desc"]/a/text()').extract_first().encode('utf-8')
        yield item

def parse(self, response):
    page = Selector(response)
    divs = page.xpath('//div[@class="game-info whole"]')
    for div in divs:
        item = AppstoreItem()
        item['title'] = div.xpath(
            './/h4[@class="title"]/a/text()').extract_first().encode('utf-8')
        item['url'] = div.xpath(
            './/h4[@class="title"]/a/@href').extract_first()
        appid = re.match(r'http://.*/(.*)', item['url']).group(1)
        item['appid'] = appid
        item['intro'] = div.xpath(
            './/p[@class="content"]/text()').extract_first().encode('utf-8')
        yield item

def parse(self, response):
    sel = Selector(response)
    toplist = []
    apps = sel.xpath('//ul[@class="ranklist"]/li')
    for app in apps:
        item = AppstoreItem()
        item['rank'] = app.xpath('div/h3/span/text()').extract()[0]
        item['name'] = app.xpath('div/h3/a[@class="hd"]/text()').extract()[0]
        item['category'] = app.xpath(
            'div/div[@class="intro"]/a[@class="intro-category"]/text()'
        ).extract()[0]
        item['size'] = app.xpath(
            'div/div[@class="intro"]/p/text()').extract()[0]
        # Strip the "大小" (size) label: keep only what follows the
        # full-width colon (u'\uff1a').
        item['size'] = item['size'].split(u'\uff1a')[1]
        toplist.append(item)
    return toplist

def parse_item(self, response):
    selected = Selector(response=response).xpath(
        '//div[contains(@class, "c-group f-wrap-items context-list-page")]'
    )
    sections = selected.xpath(
        "//section[contains(@class,'m-product-placement-item f-size-medium context-app')]"
    )
    for section in sections:
        # Re-parse the section's HTML with BeautifulSoup for simpler access.
        soup = BeautifulSoup(section.extract(), 'html.parser')
        try:
            item = AppstoreItem()
            item['name'] = soup.h3.text
            item['rating'] = soup.find('span', {'itemprop': 'ratingValue'}).text
            item['url'] = urllib.parse.urljoin(response.url,
                                               soup.find('a')['href'])
            yield item
        except (AttributeError, TypeError, KeyError):
            # Skip sections missing one of the expected elements.
            pass

def parse_item(self, response):
    page = Selector(response)
    item = AppstoreItem()
    item['title'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()'
    ).extract_first().encode('utf-8')
    item['url'] = response.url
    appid = re.match(r'http://.*/(.*)', item['url']).group(1)
    item['app_id'] = appid
    item['intro'] = page.xpath(
        '//meta[@name="description"]/@content').extract_first().encode('utf-8')
    item['thumbnail_url'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li[@class="img"]/img[@class="app-ico"]/@lazyload'
    ).extract_first().encode('utf-8')
    item['developer'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li[@class="ul-li-detail"]/span/@title'
    ).extract_first().encode('utf-8')
    # The score value is embedded in a span's class name after an underscore.
    spans = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li/p/span/@class').extract()
    for s in spans:
        if s.startswith('score'):
            item['score'] = s.split('_')[1].encode('utf-8')
            break
    divs = page.xpath('//div[@class="open-info"]')
    recomm = ""
    for div in divs:
        url = div.xpath('./p[@class="name"]/a/@href').extract_first()
        recommended_appid = re.match(r'http://.*/(.*)', url).group(1)
        name = div.xpath(
            './p[@class="name"]/a/text()').extract_first().encode('utf-8')
        recomm += "{0}:{1},".format(recommended_appid, name)
    item['recommended'] = recomm
    yield item

def parse_item(self, response):
    page = Selector(response)
    item = AppstoreItem()
    item['title'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()'
    ).extract_first().encode('utf-8')
    yield item

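# A minimal sketch of the AppstoreItem declaration the spiders above rely on.
# The field list is inferred from the assignments in the snippets; the original
# items.py may differ or split these across several item classes.
import scrapy


class AppstoreItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    appid = scrapy.Field()
    app_id = scrapy.Field()
    intro = scrapy.Field()
    thumbnailurl = scrapy.Field()
    thumbnail_url = scrapy.Field()
    developer = scrapy.Field()
    score = scrapy.Field()
    recommended = scrapy.Field()
    rank = scrapy.Field()
    name = scrapy.Field()
    category = scrapy.Field()
    size = scrapy.Field()
    rating = scrapy.Field()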