def parse_item(self, response):
    """Parse an app detail page into an AppstoreItem."""
    page = Selector(response)
    item = AppstoreItem()
    item['title'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()'
    ).extract_first().encode('utf-8')
    item['url'] = response.url
    appid = re.match(r'http://.*/(.*)', item['url']).group(1)
    item['appid'] = appid
    item['intro'] = page.xpath(
        '//meta[@name="description"]/@content').extract_first().encode('utf-8')
    # Each recommended app sits in its own "open-info" block.
    divs = page.xpath('//div[@class="open-info"]')
    recomm = ""
    for div in divs:
        url = div.xpath('./p[@class="name"]/a/@href').extract_first()
        recommended_appid = re.match(r'http://.*/(.*)', url).group(1)
        name = div.xpath(
            './p[@class="name"]/a/text()').extract_first().encode('utf-8')
        recomm += "{0}:{1},".format(recommended_appid, name)
    item['recommended'] = recomm
    yield item

def parse_item(self, response):
    page = Selector(response)
    item = AppstoreItem()
    item['title'] = page.xpath(
        './/ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()'
    ).extract_first().encode('utf-8')
    item['url'] = response.url
    item['appid'] = re.match(r'http://.*/(.*)', item['url']).group(1)
    item['intro'] = page.xpath(
        '//meta[@name="description"]/@content').extract_first().encode('utf-8')
    # The icon URL is carried in the lazyload attribute rather than src.
    item['thumbnailurl'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li[@class="img"]/img[@class="app-ico"]/@lazyload'
    ).extract_first()
    divs = page.xpath('//div[@class="open-info"]')
    recomm = ""
    for div in divs:
        url = div.xpath('./p[@class="name"]/a/@href').extract_first()
        recommended_appid = re.match(r'http://.*/(.*)', url).group(1)
        name = div.xpath(
            './p[@class="name"]/a/text()').extract_first().encode('utf-8')
        recomm += "{0}:{1},".format(recommended_appid, name)
    item['recommended'] = recomm
    yield item

def parse_item(self, response):
    page = Selector(response)
    item = AppstoreItem()
    item['title'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()'
    ).extract_first().encode('utf-8')
    item['url'] = response.url
    print item['url']
    # The app id is exposed in a hidden input rather than in the URL.
    item['appid'] = page.xpath(
        '//input[@id="appId"]/@value').extract_first().encode('utf-8')
    item['intro'] = page.xpath(
        '//meta[@name="description"]/@content').extract_first().encode('utf-8')
    divs = page.xpath('//div[@class="open-info"]')
    recomm = ""
    # len(divs) == 20: the first 10 are recommended apps, the last 10 are the
    # same-category top 10, so only the first half is collected here.
    for div in divs[:10]:
        url = div.xpath('./p[@class="name"]/a/@href').extract_first()
        recom_appid = re.search(r'C\d*', url).group()
        name = div.xpath(
            './p[@class="name"]/a/@title').extract_first().encode('utf-8')
        recomm += "{0}:{1},".format(recom_appid, name)
    item['recommended'] = recomm
    yield item

def parse(self, response):
    """
    response.body is the result of a render.html call: HTML that has already
    been processed by a browser. Parse that HTML here.

    :param response:
    :return: a request for each detail page, plus a request for the next
        page if one exists
    """
    page = Selector(response)
    divs = page.xpath('//div[@class="list-game-app dotline-btn nofloat"]')
    current_url = response.url
    # Parse details; only the first two entries are followed here.
    count = 0
    for div in divs:
        if count >= 2:
            break
        item = AppstoreItem()
        info = div.xpath('.//div[@class="game-info whole"]')
        detail_url = info.xpath(
            './h4[@class="title"]/a/@href').extract_first()
        item["url"] = detail_url
        req = Request(detail_url, callback=self.parse_detail_page)
        req.meta["item"] = item
        count += 1
        yield req
    # next page

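# A minimal sketch, not taken from the original project, of how the request
# feeding the parse() method above could be produced: the page is fetched
# through a Splash render.html endpoint so that response.body holds
# browser-rendered HTML. The Splash address and the start URL below are
# assumptions, and parse() is assumed to be a method of this spider.
from urllib import urlencode  # Python 2, matching the snippets above

from scrapy import Request, Spider


class RenderedListSpider(Spider):
    name = 'rendered_list'
    splash_endpoint = 'http://localhost:8050/render.html'  # assumed Splash host

    def start_requests(self):
        target = 'http://app.example.com/list'  # hypothetical list-page URL
        # Ask Splash to load the page, run its JavaScript, and return the
        # final HTML, which then reaches parse() as response.body.
        query = urlencode({'url': target, 'wait': 0.5})
        yield Request('{0}?{1}'.format(self.splash_endpoint, query),
                      callback=self.parse)
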
def parse(self, response):
    page = Selector(response)
    divs = page.xpath('//ul[@class="applist"]/li')
    for div in divs:
        item = AppstoreItem()
        item['title'] = div.xpath(
            './h5/a/text()').extract_first().encode('utf-8')
        item['url'] = div.xpath('./h5/a/@href').extract_first()
        appid = re.match(r'/detail/(.*)', item['url']).group(1)
        item['appid'] = appid
        item['intro'] = div.xpath(
            './/p[@class="app-desc"]/a/text()').extract_first().encode('utf-8')
        yield item

def parse(self, response):
    page = Selector(response)
    divs = page.xpath('//div[@class="game-info whole"]')
    for div in divs:
        item = AppstoreItem()
        item['title'] = div.xpath(
            './/h4[@class="title"]/a/text()').extract_first().encode('utf-8')
        item['url'] = div.xpath(
            './/h4[@class="title"]/a/@href').extract_first()
        appid = re.match(r'http://.*/(.*)', item['url']).group(1)
        item['appid'] = appid
        item['intro'] = div.xpath(
            './/p[@class="content"]/text()').extract_first().encode('utf-8')
        yield item

def parse(self, response):
    sel = Selector(response)
    toplist = []
    apps = sel.xpath('//ul[@class="ranklist"]/li')
    for app in apps:
        item = AppstoreItem()
        item['rank'] = app.xpath('div/h3/span/text()').extract()[0]
        item['name'] = app.xpath('div/h3/a[@class="hd"]/text()').extract()[0]
        item['category'] = app.xpath(
            'div/div[@class="intro"]/a[@class="intro-category"]/text()'
        ).extract()[0]
        item['size'] = app.xpath(
            'div/div[@class="intro"]/p/text()').extract()[0]
        # Strip the "大小" (size) label: keep only what follows the
        # full-width colon (u'\uff1a').
        item['size'] = item['size'].split(u'\uff1a')[1]
        toplist.append(item)
    return toplist

def parse_item(self, response):
    selected = Selector(response=response).xpath(
        '//div[contains(@class, "c-group f-wrap-items context-list-page")]'
    )
    sections = selected.xpath(
        "//section[contains(@class,'m-product-placement-item f-size-medium context-app')]"
    )
    for section in sections:
        # Re-parse the section's HTML with BeautifulSoup for simpler access.
        soup = BeautifulSoup(section.extract(), 'html.parser')
        try:
            item = AppstoreItem()
            item['name'] = soup.h3.text
            item['rating'] = soup.find('span', {'itemprop': 'ratingValue'}).text
            item['url'] = urllib.parse.urljoin(response.url,
                                               soup.find('a')['href'])
            yield item
        except (AttributeError, TypeError, KeyError):
            # Skip sections missing one of the expected elements.
            pass

def parse_item(self, response):
    page = Selector(response)
    item = AppstoreItem()
    item['title'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()'
    ).extract_first().encode('utf-8')
    item['url'] = response.url
    appid = re.match(r'http://.*/(.*)', item['url']).group(1)
    item['app_id'] = appid
    item['intro'] = page.xpath(
        '//meta[@name="description"]/@content').extract_first().encode('utf-8')
    item['thumbnail_url'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li[@class="img"]/img[@class="app-ico"]/@lazyload'
    ).extract_first().encode('utf-8')
    item['developer'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li[@class="ul-li-detail"]/span/@title'
    ).extract_first().encode('utf-8')
    # The score value is embedded in a span's class name after an underscore.
    spans = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li/p/span/@class').extract()
    for s in spans:
        if s.startswith('score'):
            item['score'] = s.split('_')[1].encode('utf-8')
            break
    divs = page.xpath('//div[@class="open-info"]')
    recomm = ""
    for div in divs:
        url = div.xpath('./p[@class="name"]/a/@href').extract_first()
        recommended_appid = re.match(r'http://.*/(.*)', url).group(1)
        name = div.xpath(
            './p[@class="name"]/a/text()').extract_first().encode('utf-8')
        recomm += "{0}:{1},".format(recommended_appid, name)
    item['recommended'] = recomm
    yield item

def parse_item(self, response):
    page = Selector(response)
    item = AppstoreItem()
    item['title'] = page.xpath(
        '//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()'
    ).extract_first().encode('utf-8')
    yield item

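# A minimal sketch of the AppstoreItem declaration the spiders above rely on.
# The field list is inferred from the assignments in the snippets; the original
# items.py may differ or split these across several item classes.
import scrapy


class AppstoreItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    appid = scrapy.Field()
    app_id = scrapy.Field()
    intro = scrapy.Field()
    thumbnailurl = scrapy.Field()
    thumbnail_url = scrapy.Field()
    developer = scrapy.Field()
    score = scrapy.Field()
    recommended = scrapy.Field()
    rank = scrapy.Field()
    name = scrapy.Field()
    category = scrapy.Field()
    size = scrapy.Field()
    rating = scrapy.Field()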