def parse_info(self, response):
    """Extract a TexasItem (inmate record) from a detail page.

    Reads label/value pairs from the page's main table and from auxiliary
    <p>/<span> pairs, mapping page labels to item fields via ``self.lookup``.
    Returns the populated item.
    """
    hxs = HtmlXPathSelector(response)
    rows = hxs.select("//table/tr")
    item = TexasItem()
    # Remove the last 5 chars from the identifying URL (presumably a
    # ".html" suffix — TODO confirm).
    item['ident'] = response.url[0:-5]
    # Rip out the info, compare using lookup table: the last two text cells
    # of each row form a (label, value) pair.
    for tr in rows:
        td = tr.select("td/text()")
        l = len(td)
        key = td[l-2].extract()
        val = td[l-1].extract()
        item[self.lookup[key]] = val
    values = hxs.select("//p/text()")
    keys = hxs.select("//p/span/text()")
    # Rip down the auxiliary data; values are offset by one from their keys.
    for i in range(len(keys)-1):
        key = keys[i].extract()
        val = values[i+1].extract()
        item[self.lookup[key]] = self.cleanString(val)
    # And lastly the mugshot (first table image, if any).
    hxs = hxs.select("//table/tr/td/img/@src")
    if len(hxs.extract()) >= 1:
        item['mugshot'] = "http://www.tdcj.state.tx.us/stat/dr_info/" + hxs.extract()[0]
    return item
def parse(self, response):
    """Yield one Product per condition grade ('Like New'/'Fair'/'Poor').

    Grade prices and deduction options are scraped from inline JavaScript
    (``productDeduc.push``/``productAns.push``/``pp1=`` fragments) via
    ``self.get_data``.
    """
    hxs = HtmlXPathSelector(response)
    deduct = self.get_data(hxs.extract(), 'productDeduc.push("', '")')
    answers = self.get_data(hxs.extract(), 'productAns.push("', '")')
    checks = hxs.select('//form[@name="customVal"]/table/tr/td[@style="padding-left:30px;"]/text()').extract()
    answers_deduct = zip(answers, deduct)
    # (option label, (answer, deduction)) triples consumed by calc_price.
    options = zip(checks, answers_deduct)
    # Gets the prices.  The last 'pp1=' match is dropped ([:-1]) — presumably
    # a sentinel in the page script; verify against the markup.
    prices = zip(['Like New', 'Fair', 'Poor'], self.get_data(hxs.extract(), 'pp1=', ';\n')[:-1])
    for grade, price in prices:
        loader = ProductLoader(item=Product(), response=response)
        name = hxs.select('//*[@id="vmMainPage"]/div/div/div/div/h1/text()').extract()[0]
        loader.add_value('name', ' '.join((name, grade)))
        loader.add_value('price', self.calc_price(float(price), options))
        loader.add_value('url', response.url)
        yield loader.load_item()
def parse_products(self, response):
    """Scrape products from a listing page, following rel="next" pagination.

    Category pages are re-requested so the default callback handles them.
    Product rows have no wrapper element, so the raw HTML is patched: the
    pagination spacer row and the separator row are rewritten into
    <table class="item"> boundaries before re-parsing.
    """
    hxs = HtmlXPathSelector(response)
    categories = hxs.select(
        '//p[@class="catname"]/strong/a/@href').extract()
    if categories:
        # Category page: re-request same URL (dont_filter bypasses dedup).
        yield Request(response.url, dont_filter=True)
    else:
        # Exact markup of the pagination spacer row used as a split marker.
        tr = ('<tr><td colspan="3" align="center" class="pagenums">' +
              '<p class="pagenums">\r\n\t\t\t\t ' +
              '<img src="images/clearpixel.gif" width="300" ' +
              'height="8" alt=""></p></td>\r\n\t\t\t </tr>')
        tr_end = '<tr>' + hxs.select('//td[@class="prodseparator"]').\
            extract()[0].decode('utf') + '</tr>'
        html = hxs.extract().replace(tr, '<table class="item">').\
            replace(tr_end, '</table><table class="item">')
        products_hxs = HtmlXPathSelector(text=html)
        products = products_hxs.select('//table[@class="item"]')
        for product in products:
            name = product.select(
                'tr/td/strong/div[@class="prodname"]/a/text()').extract()
            if name:
                name = name[0]
            url = product.select(
                'tr/td/strong/div[@class="prodname"]/a/@href').extract(
                )
            if url:
                url = url[0]
            price_options = product.select(
                'tr/td/form/script').extract()
            if price_options:
                # Multiple price/description options embedded in a script tag.
                price_values = self._get_prices(price_options[0])
                for price, desc in price_values:
                    loader = ProductLoader(item=Product(), selector=product)
                    loader.add_value('name', ' '.join((name, desc)))
                    loader.add_value(
                        'url', urljoin_rfc(get_base_url(response), url))
                    loader.add_value('price', price)
                    yield loader.load_item()
            else:
                price = product.select(
                    'tr/td/div[@class="prodprice"]/span/text()'
                ).extract()
                if price:
                    price = price[0]
                else:
                    price = 0.0
                loader = ProductLoader(item=Product(), selector=product)
                loader.add_value('name', name)
                loader.add_value(
                    'url', urljoin_rfc(get_base_url(response), url))
                loader.add_value('price', price)
                yield loader.load_item()
        # NOTE: `next` shadows the builtin; harmless in this scope.
        next = hxs.select(
            '//a[@class="ectlink" and @ rel="next"]/@href').extract()
        if next:
            url = urljoin_rfc(get_base_url(response), next[0])
            yield Request(url, callback=self.parse_products)
def parse_product(self, response):
    """Parse a product page: fill the basic Product fields, then fire the
    site's AJAX sizes endpoint to fetch the product's options.

    If the page is actually a listing (product boxes present), recurse into
    each linked product instead.
    """
    hxs = HtmlXPathSelector(response)
    product_list = hxs.select(u'//div[@class="box"]/div/a/@href').extract()
    if product_list:
        # Listing page: follow every product link with the same callback.
        for url in product_list:
            url = urljoin_rfc(get_base_url(response), url)
            yield Request(url, callback=self.parse_product, meta=response.meta)
        return
    # Fill main product fields.
    product_loader = ProductLoader(item=Product(), selector=hxs)
    product_loader.add_value('url', response.url)
    product_loader.add_value('category', response.meta.get('category'))
    img = hxs.select('//div[@class="image"]/img/@src').extract()
    if img:
        product_loader.add_value('image_url', urljoin_rfc(get_base_url(response), img[0]))
    brand = ''.join(hxs.select('//li[contains(text(), "Brand")]/text()').extract()).replace('Brand: ', '')
    product_loader.add_value('brand', brand)
    # (Cleanup: the previous version extracted the page title into a local
    # `name` that was never used; the name is filled in later from options.)
    product = product_loader.load_item()
    # Construct the URL for the AJAX request that returns all product options.
    pid = hxs.select('//input[@name="product_id"]/@value').extract()
    if pid:
        # FIX: the old loop reset `clearance` on every non-matching line, so
        # it reflected only the LAST line of the page source.  The flag is
        # true iff ANY line declares `var isclearance ... true`.
        clearance = 'true' if any(
            'var isclearance' in line.lower() and 'true' in line.lower()
            for line in hxs.extract().split('\n')) else 'false'
        url = 'http://www.mattressnextday.co.uk/?route=api/product/sizes&timestamp={}&productId={}&callback=jQuery110206226998819969816_1389291112656&storeId=0&isClearance={}&_=1389291112657'.format(int(time.time()), pid[0], clearance)
        yield Request(url, meta={'product': product}, callback=self.get_product_options)
    else:
        self.log('ERROR! Unable to parse product ID from url: {}'.format(response.url))
def parse_game(self, response):
    """Parse a J! Archive game page into a list of JarchiveItem clues."""
    self.log("Found game page %s" % response.url)
    hxs = HtmlXPathSelector(response)
    clues = hxs.select('//td[@class="clue"]')
    jitems = []
    game = first(hxs.select('//div[@id="game_title"]/h1/text()').extract())
    # Category names in document order (first round, then double jeopardy).
    cats = hxs.select('//td[@class="category_name"]/text()').extract()
    self.log(game)
    for clue in clues:
        jitem = JarchiveItem()
        # The correct response is hidden inside the cell's onmouseover JS.
        found = clue.select('table/tr/td/div/@onmouseover').extract()
        if len(found) > 0:
            # Clue id splits as [prefix, round, column, ...] — presumably
            # e.g. "clue_DJ_3_2"; verify against the site markup.
            clueinfo = first(clue.select('.//td[@class="clue_text"]/@id').extract()).split("_")
            round = clueinfo[1]  # NOTE: shadows the builtin `round`
            cluecol = int(clueinfo[2])-1
            if round == "DJ":
                # Double-jeopardy categories follow the six first-round ones.
                cluecol += 6
            togglebox = found[0].split("', '")
            cr = HtmlXPathSelector(text=togglebox[2]).select(".//em[@class='correct_response']/text()")
            cr = first(cr.extract())
            v = first(clue.select('.//td[@class="clue_value"]/text()').extract())
            if v:
                v = v[1:]  # drop the leading character (presumably "$")
            c = first(clue.select('.//td[@class="clue_text"]/text()').extract())
            (jitem['correct_response'], jitem['value'], jitem['clue'],
             jitem['game'], jitem['category']) = cr, v, c, game, cats[cluecol]
            jitems.append(jitem)
    return jitems
def parse_product(self, response):
    """Build a Product from JSON-like fragments embedded in the page body."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    prod = hxs.extract()
    if prod:
        url = response.meta['prod_url']
        url = urljoin_rfc(self.start_urls[0], url)
        # Title appears as "title":"..." in the embedded JSON.
        name = re.search('"title":"([^"]*)"+', prod).group()
        name = name.split(":")[1].strip('"').strip()
        if name:
            name_sufix = ''
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('url', url)
            sku = re.search('"reference":"(?:[^\\"]+|\\.)*"', prod)  #re.search('"reference":"(\d+-\d+\S\s+\w+)', prod)
            if not sku:
                # Fallback pattern for references like "12-34Xabc".
                sku = re.search('"reference":"(\d+-\d+\S\w+)', prod)
            if sku:
                sku = sku.group().split(':')[1].strip('",')
                loader.add_value('sku', sku)
                loader.add_value('identifier', sku)
                # Everything after the first dash becomes a name suffix.
                name_sufix = '-'.join(sku.split('-')[1:])
            if name_sufix:
                loader.add_value('name', name+' ('+name_sufix+')')
            else:
                loader.add_value('name', name)
            price = re.findall('"flat_price_inc":"(\d+.\d+)', prod)
            if price:
                # NOTE(review): price[0][:-1] drops the last captured char —
                # presumably trimming a trailing digit/quote artifact; confirm.
                loader.add_value('price', price[0][:-1])
            yield loader.load_item()
def parse_game(self, response):
    """Parse a J! Archive game page and return the list of clue items."""
    self.log("Found game page %s" % response.url)
    hxs = HtmlXPathSelector(response)
    clues = hxs.select('//td[@class="clue"]')
    jitems = []
    game = first(hxs.select('//div[@id="game_title"]/h1/text()').extract())
    # All category names in document order; double-jeopardy ones come after
    # the six first-round categories.
    cats = hxs.select('//td[@class="category_name"]/text()').extract()
    self.log(game)
    for clue in clues:
        jitem = JarchiveItem()
        # Correct response lives inside the onmouseover JS of the clue cell.
        found = clue.select('table/tr/td/div/@onmouseover').extract()
        if len(found) > 0:
            # id splits as [prefix, round, column, ...] — TODO confirm format.
            clueinfo = first(
                clue.select(
                    './/td[@class="clue_text"]/@id').extract()).split("_")
            round = clueinfo[1]  # shadows builtin `round`; local only
            cluecol = int(clueinfo[2]) - 1
            if round == "DJ":
                cluecol += 6  # offset into the double-jeopardy categories
            togglebox = found[0].split("', '")
            cr = HtmlXPathSelector(text=togglebox[2]).select(
                ".//em[@class='correct_response']/text()")
            cr = first(cr.extract())
            v = first(
                clue.select('.//td[@class="clue_value"]/text()').extract())
            if v:
                v = v[1:]  # drop leading character (presumably "$")
            c = first(
                clue.select('.//td[@class="clue_text"]/text()').extract())
            (jitem['correct_response'], jitem['value'], jitem['clue'],
             jitem['game'], jitem['category']) = cr, v, c, game, cats[cluecol]
            jitems.append(jitem)
    return jitems
def parse_product(self, response):
    """Extract a Product from JSON-like fragments in the raw page body."""
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    prod = hxs.extract()
    if prod:
        url = response.meta['prod_url']
        url = urljoin_rfc(self.start_urls[0], url)
        # "title":"..." fragment of the embedded JSON.
        name = re.search('"title":"([^"]*)"+', prod).group()
        name = name.split(":")[1].strip('"').strip()
        if name:
            name_sufix = ''
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('url', url)
            sku = re.search(
                '"reference":"(?:[^\\"]+|\\.)*"',
                prod)  #re.search('"reference":"(\d+-\d+\S\s+\w+)', prod)
            if not sku:
                # Alternative reference format fallback.
                sku = re.search('"reference":"(\d+-\d+\S\w+)', prod)
            if sku:
                sku = sku.group().split(':')[1].strip('",')
                loader.add_value('sku', sku)
                loader.add_value('identifier', sku)
                # Suffix = everything after the first dash of the SKU.
                name_sufix = '-'.join(sku.split('-')[1:])
            if name_sufix:
                loader.add_value('name', name + ' (' + name_sufix + ')')
            else:
                loader.add_value('name', name)
            price = re.findall('"flat_price_inc":"(\d+.\d+)', prod)
            if price:
                # NOTE(review): [:-1] trims the last matched character —
                # verify this is intentional against sample data.
                loader.add_value('price', price[0][:-1])
            yield loader.load_item()
def test_null_bytes(self):
    """Null bytes in the input markup are stripped by both selector types."""
    markup_with_nul = '<root>la\x00la</root>'
    # HTML selector wraps the fragment in html/body and drops the NUL.
    self.assertEqual(
        HtmlXPathSelector(text=markup_with_nul).extract(),
        u'<html><body><root>lala</root></body></html>')
    # XML selector keeps the fragment as-is, minus the NUL.
    self.assertEqual(
        XmlXPathSelector(text=markup_with_nul).extract(),
        u'<root>lala</root>')
def parse(self, response):
    """Read the total result count and schedule one request per listing page.

    The count appears in the markup as ``of <b>N</b``; 96 products fit on a
    page, and the range upper bound gets +2 (exclusive bound + partial page).
    """
    hxs = HtmlXPathSelector(response)
    p = re.compile("of <b>(\d+)</b", re.IGNORECASE)
    total = p.findall(hxs.extract())
    # FIX: guard against a missing count (layout change / empty results) —
    # previously total[0] raised IndexError.
    if not total:
        self.log("No total result count found on %s" % response.url)
        return
    pages = int(int(total[0]) / 96) + 2
    for i in range(1, pages):
        next_url = self.start_urls[0] + "&page=" + str(i)
        yield Request(next_url, meta={"cur": i, "attempt": 1}, callback=self.parse_items)
def parse(self, response):
    """Schedule one request per result page based on the page's total count.

    The total is scraped from ``of <b>N</b`` in the raw markup; with 96
    items per page, ``pages`` is the exclusive range bound (+2 covers the
    partial last page).
    """
    hxs = HtmlXPathSelector(response)
    p = re.compile('of <b>(\d+)</b', re.IGNORECASE)
    total = p.findall(hxs.extract())
    if not total:
        # FIX: previously an unguarded total[0] raised IndexError whenever
        # the count was absent from the page.
        self.log('No total result count found on %s' % response.url)
        return
    pages = int(int(total[0])/96) + 2
    for i in range(1, pages):
        next_url = self.start_urls[0] + "&page=" + str(i)
        yield Request(next_url, meta={'cur': i, 'attempt': 1}, callback=self.parse_items)
def parse_categories(self, response):
    """Follow every sub-category link on a category page.

    The links have no wrapper element in the page markup, so the raw HTML
    is patched to introduce a synthetic <div id="sub_categories"> that can
    then be targeted by XPath.
    """
    raw_html = HtmlXPathSelector(response).extract()
    patched = raw_html.replace('Sub Categories', '<div id="sub_categories">')
    patched = patched.replace('<p> </p>', '</div>')
    patched_hxs = HtmlXPathSelector(text=patched)
    hrefs = patched_hxs.select('//*[@id="sub_categories"]/a/@href').extract()
    for href in hrefs:
        absolute = urljoin_rfc(get_base_url(response), href)
        yield Request(absolute, self.parse_products)
def parse(self, response):
    """Emit one Product per condition grade, priced from inline JS data."""
    hxs = HtmlXPathSelector(response)
    # Deductions and answers are pushed into JS arrays in the page source.
    deduct = self.get_data(hxs.extract(), 'productDeduc.push("', '")')
    answers = self.get_data(hxs.extract(), 'productAns.push("', '")')
    checks = hxs.select(
        '//form[@name="customVal"]/table/tr/td[@style="padding-left:30px;"]/text()'
    ).extract()
    answers_deduct = zip(answers, deduct)
    options = zip(checks, answers_deduct)
    # Gets the prices.  [:-1] drops the final 'pp1=' match — presumably a
    # sentinel; verify against the page script.
    prices = zip(['Like New', 'Fair', 'Poor'],
                 self.get_data(hxs.extract(), 'pp1=', ';\n')[:-1])
    for grade, price in prices:
        loader = ProductLoader(item=Product(), response=response)
        name = hxs.select(
            '//*[@id="vmMainPage"]/div/div/div/div/h1/text()').extract()[0]
        loader.add_value('name', ' '.join((name, grade)))
        loader.add_value('price', self.calc_price(float(price), options))
        loader.add_value('url', response.url)
        yield loader.load_item()
def test_selector_over_text(self):
    """Selectors built from raw text normalize the markup appropriately."""
    markup = '<root>lala</root>'
    # HTML parsing wraps the fragment in html/body.
    self.assertEqual(HtmlXPathSelector(text=markup).extract(),
                     u'<html><body><root>lala</root></body></html>')
    # XML parsing preserves the fragment verbatim.
    self.assertEqual(XmlXPathSelector(text=markup).extract(),
                     u'<root>lala</root>')
    # Selecting the context node ('.') yields the document as a one-item list.
    self.assertEqual(XmlXPathSelector(text=markup).select('.').extract(),
                     [u'<root>lala</root>'])
def parse_categories(self, response):
    """Follow all sub-category links found on a category page.

    There is no container element around the links, so the raw HTML is
    patched to wrap them in <div id="sub_categories"> before re-parsing.
    """
    hxs = HtmlXPathSelector(response)
    html = hxs.extract().replace('Sub Categories',
                                 '<div id="sub_categories">').replace(
                                     '<p> </p>', '</div>')
    new_hxs = HtmlXPathSelector(text=html)
    sub_categories = new_hxs.select(
        '//*[@id="sub_categories"]/a/@href').extract()
    for sub_category in sub_categories:
        url = urljoin_rfc(get_base_url(response), sub_category)
        yield Request(url, self.parse_products)
def parse_subproducts(self, response): hxs = HtmlXPathSelector(response) #Fix for the HTML code. html = hxs.extract().replace('<br></h3>','').\ replace('<h3','<div class="item"').\ replace('</p>\n <div','</p></div>\n <div').\ replace('<input type="radio"', '<div class="hd" ').\ replace('checked>','>').\ replace('</p></div>','</div></p></div>').\ replace('</p>\n', '</div></p>\n') products_hxs = HtmlXPathSelector(text=html) products = products_hxs.select('//div[@class="item"]') for product in products: sub_products = product.select('div[@class="hd"]') if sub_products: for sub_product in sub_products: value = sub_product.select('./@value').extract()[0] hd = sub_product.select('./text()').extract()[0] name = ' '.join( (product.select('p/text()').extract()[0], hd)) extracted = process.extractOne(name, self.products) try: if extracted[1] >= 98: url = 'http://sellusyourgadget.co.uk/index.php/home/getConditions/%s' yield Request(url % value.split(':')[0], callback=self.parse_options, meta={ 'id': response.meta['id'], 'name': name, 'memoryR': value, 'memory': value }) except TypeError: return else: name = product.select('p/text()').extract()[0] extracted = process.extractOne(name, self.products) try: if extracted[1] >= 98: value = product.select('p/input/@value').extract()[0] url = 'http://sellusyourgadget.co.uk/index.php/home/getConditions/%s' yield Request(url % value.split(':')[0], callback=self.parse_options, meta={ 'id': response.meta['id'], 'name': name, 'memoryR': value, 'memory': value }) except TypeError: return
def parse_detail(self, response):
    """Decode the response body using its chardet-detected charset.

    Logs a warning and bails out when no output file is configured.  On any
    decode failure, falls back to the selector's re-serialized markup.
    """
    outputfile = self.output_file
    if not outputfile:
        log.msg("download %s fail" % response.url, level=log.WARNING, spider=self)
        return
    content_type = chardet.detect(response.body)
    hxs = HtmlXPathSelector(response)
    maindoing = ''
    try:
        # Presumably chardet misdetects these GBK pages as ISO-8859-2 /
        # GB2312, so the encoding is forced — TODO confirm.
        if content_type['encoding'] in ['ISO-8859-2', 'GB2312']:
            content_type['encoding'] = 'gbk'
        maindoing = response.body.decode(content_type['encoding'])
    except Exception:
        # FIX: was `except Exception, e` — Python-2-only syntax with an
        # unused binding.  Broad catch is deliberate: any decode failure
        # (including encoding=None) falls back to the extracted markup.
        maindoing = hxs.extract()
def parse_article(self, response):
    """Return an Article when the page body matches any tracked keyword,
    else None."""
    self.log("Haciendo como que parseo el articulo %s" % response.url)
    hxs = HtmlXPathSelector(response)
    for k in keywords:
        if (re.search(k, hxs.extract())):
            self.log("El art. en %s contiene %s" % (response.url, k))
            # Title located by fixed positional indexes into nested tables —
            # brittle: breaks if the page layout changes.
            title = hxs.select('//table')[5].select('.//table')[2].select('.//font')[0].select('.//font')[2].select('text()').extract()
            item = Article()
            item['title'] = title[0]
            item['url'] = response.url
            return item
    return None
def parse_course_list(self, response, courselist):
    """Split the page into per-course HTML fragments and build course items.

    Course header rows are the <tr>s containing a blue font tag; the raw
    HTML between consecutive headers is that course's detail fragment, from
    which colour-coded annotations are extracted by regex.
    """
    retval = []
    hxs = HtmlXPathSelector(response)
    courses = hxs.select('.//tr[descendant::font[@color="#0000FF"]]')
    length = len(courses)
    if length == 0:  # no course to process
        return []
    # Everything after the first course header row.
    data = hxs.extract().split(courses[0].extract())[1]
    if length == 1:
        course_details = [data]
    else:
        course_details = []
        for course in courses[1:]:
            # Split off the fragment belonging to the previous course.
            s = data.split(course.extract())
            course_details.append(s[0])
            data = s[1]
        course_details.append(s[1])  # tail fragment = last course's detail
    # sanity check
    assert (length == len(course_details))
    flags = re.UNICODE | re.MULTILINE  #| re.DOTALL
    for course, course_detail in zip(courses, course_details):
        code_title_au_dept = course.select('.//font/text()').extract()
        # Colour-coded annotations — RED/BROWN/GREEN/#FF00FF presumed to mean
        # pass-fail / mutually-exclusive / unavailable / prerequisite from the
        # variable names; confirm against the source site.
        passfail = filter(
            None, re.findall(u'<font.*color="RED">([^<]*)', course_detail,
                             flags))
        mutex = filter(
            None, re.findall(u'<font.*color="BROWN">([^<]*)', course_detail,
                             flags))
        unavail = filter(
            None, re.findall(u'<font.*color="GREEN">([^<]*)', course_detail,
                             flags))
        prereq = filter(
            None, re.findall(u'<font.*color="#FF00FF">([^<]*)', course_detail,
                             flags))
        desc = re.search(u'<font size="2">([^<]*)', course_detail).groups()[0]
        courseitem = self._fill_in(courselist, code_title_au_dept, passfail,
                                   mutex, unavail, prereq, desc)
        if courseitem:
            retval.append(courseitem)
    return retval
def parse_products(self, response):
    """Yield products from the listing table, or recurse into category and
    sub-category links when the page has no products.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//td[@class="td"]/div[@style="width:750px;'
                          ' padding: 10px 0px 10px 20px; "]/'
                          'table [@width="80%" and @cellpadding="4" and'
                          ' @border="0" and @align="center"]')
    if products:
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            # Two alternative layouts carry the product name.
            loader.add_xpath('name', 'tr/td/table/tr/td/strong/a/text()')
            loader.add_xpath('name', 'tr/td/div/strong/a/text()')
            url = product.select(
                'tr/td/table/tr/td/strong/a/@href').extract()
            if url:
                url = urljoin_rfc(get_base_url(response), url[0])
            else:
                # Fallback layout for the link.
                url = product.select('tr/td/div/strong/a/@href').extract()
                if url:
                    url = urljoin_rfc(get_base_url(response), url[0])
            loader.add_value('url', url)
            loader.add_xpath('price', 'tr/td/div[@class="HeadingText"]/text()')
            yield loader.load_item()
    else:
        try:
            categories = hxs.select(
                '//td[@class="td"]/div[@style="width:750px;'
                ' padding: 10px 0px 10px 20px; "]/'
                'table[@cellpadding="5"]')
            if categories:
                for category in categories:
                    url = urljoin_rfc(
                        get_base_url(response),
                        category.select(
                            'tr/td/a[@class="HeadingText"]/@href').extract(
                            )[0])
                    yield Request(url, dont_filter=True,
                                  callback=self.parse_products)
        except IndexError:
            # Category row without a link: skip silently.
            pass
        # Sub-category links lack a container; patch the markup to add one.
        html = hxs.extract().replace('Sub Categories',
                                     '<div id="sub_categories">').replace(
                                         '<p> </p>', '</div>')
        new_hxs = HtmlXPathSelector(text=html)
        sub_categories = new_hxs.select(
            '//*[@id="sub_categories"]/a/@href').extract()
        for sub_category in sub_categories:
            url = urljoin_rfc(get_base_url(response), sub_category)
            yield Request(url, dont_filter=True, callback=self.parse_products)
def parse_article(self, response):
    """Return an Article item if any tracked keyword matches the page body."""
    self.log("Haciendo como que parseo el articulo %s" % response.url)
    hxs = HtmlXPathSelector(response)
    for k in keywords:
        if (re.search(k, hxs.extract())):
            self.log("El art. en %s contiene %s" % (response.url, k))
            # Positional table/font indexing — brittle against layout changes.
            title = hxs.select('//table')[5].select('.//table')[2].select(
                './/font')[0].select('.//font')[2].select(
                    'text()').extract()
            item = Article()
            item['title'] = title[0]
            item['url'] = response.url
            return item
    return None
def parse_subproducts(self, response): hxs = HtmlXPathSelector(response) # Fix for the HTML code. html = ( hxs.extract() .replace("<br></h3>", "") .replace("<h3", '<div class="item"') .replace("</p>\n <div", "</p></div>\n <div") .replace('<input type="radio"', '<div class="hd" ') .replace("checked>", ">") .replace("</p></div>", "</div></p></div>") .replace("</p>\n", "</div></p>\n") ) products_hxs = HtmlXPathSelector(text=html) products = products_hxs.select('//div[@class="item"]') for product in products: sub_products = product.select('div[@class="hd"]') if sub_products: for sub_product in sub_products: value = sub_product.select("./@value").extract()[0] hd = sub_product.select("./text()").extract()[0] name = " ".join((product.select("p/text()").extract()[0], hd)) extracted = process.extractOne(name, self.products) try: if extracted[1] >= 98: url = "http://sellusyourgadget.co.uk/index.php/home/getConditions/%s" yield Request( url % value.split(":")[0], callback=self.parse_options, meta={"id": response.meta["id"], "name": name, "memoryR": value, "memory": value}, ) except TypeError: return else: name = product.select("p/text()").extract()[0] extracted = process.extractOne(name, self.products) try: if extracted[1] >= 98: value = product.select("p/input/@value").extract()[0] url = "http://sellusyourgadget.co.uk/index.php/home/getConditions/%s" yield Request( url % value.split(":")[0], callback=self.parse_options, meta={"id": response.meta["id"], "name": name, "memoryR": value, "memory": value}, ) except TypeError: return
def parsePage(self, response):
    """Harvest the first e-mail address on the page into the carried item,
    then crawl outgoing links recursively with the same item attached.
    """
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']
    emails = collectAllEmail(hxs.extract())
    if len(emails) > 0:
        item['email'] = emails[0]
        yield item
    # NOTE(review): allow_domains expects domain names, but a full URL is
    # passed here — this likely filters out every link; confirm intent.
    extractor = SgmlLinkExtractor(allow_domains=response.url)
    for entry in extractor.extract_links(response):
        if entry.url is not None:
            req = Request(entry.url, callback=self.parsePage)
            req.meta['item'] = item
            yield req
def parse_detail(self, response):
    """Decode the body with its detected charset, falling back to the
    selector's re-serialized markup on any decode failure.

    Logs a warning and returns early when no output file is configured.
    """
    outputfile = self.output_file
    if not outputfile:
        log.msg("download %s fail" % response.url,
                level=log.WARNING,
                spider=self)
        return
    content_type = chardet.detect(response.body)
    hxs = HtmlXPathSelector(response)
    maindoing = ''
    try:
        # Presumably these pages are really GBK and chardet misdetects
        # them — the encoding is forced; TODO confirm.
        if content_type['encoding'] in ['ISO-8859-2', 'GB2312']:
            content_type['encoding'] = 'gbk'
        maindoing = response.body.decode(content_type['encoding'])
    except Exception:
        # FIX: was `except Exception, e` (Python-2-only, `e` unused).
        # Deliberate broad fallback: any failure uses the extracted markup.
        maindoing = hxs.extract()
def parse_products(self, response):
    """Scrape products from a listing page (re-requesting category pages),
    following rel="next" pagination.

    Product rows have no wrapper element, so the raw HTML is patched: the
    pagination spacer row and the separator row are rewritten into
    <table class="item"> boundaries before re-parsing.
    """
    hxs = HtmlXPathSelector(response)
    categories = hxs.select('//p[@class="catname"]/strong/a/@href').extract()
    if categories:
        # Category page: re-request same URL; dont_filter bypasses dedup.
        yield Request(response.url, dont_filter=True)
    else:
        # Exact markup of the pagination spacer row used as a split marker.
        tr = ('<tr><td colspan="3" align="center" class="pagenums">'+
              '<p class="pagenums">\r\n\t\t\t\t '+
              '<img src="images/clearpixel.gif" width="300" '+
              'height="8" alt=""></p></td>\r\n\t\t\t </tr>')
        tr_end = '<tr>' + hxs.select('//td[@class="prodseparator"]').\
            extract()[0].decode('utf') + '</tr>'
        html = hxs.extract().replace(tr,'<table class="item">').\
            replace(tr_end,'</table><table class="item">')
        products_hxs = HtmlXPathSelector(text=html)
        products = products_hxs.select('//table[@class="item"]')
        for product in products:
            name = product.select('tr/td/strong/div[@class="prodname"]/a/text()').extract()
            if name:
                name = name[0]
            url = product.select('tr/td/strong/div[@class="prodname"]/a/@href').extract()
            if url:
                url = url[0]
            price_options = product.select('tr/td/form/script').extract()
            if price_options:
                # Option prices live inside an inline <script> tag.
                price_values = self._get_prices(price_options[0])
                for price, desc in price_values:
                    loader = ProductLoader(item=Product(), selector=product)
                    loader.add_value('name', ' '.join((name,desc)))
                    loader.add_value('url', urljoin_rfc(get_base_url(response), url))
                    loader.add_value('price', price)
                    yield loader.load_item()
            else:
                price = product.select('tr/td/div[@class="prodprice"]/span/text()').extract()
                if price:
                    price = price[0]
                else:
                    price = 0.0
                loader = ProductLoader(item=Product(), selector=product)
                loader.add_value('name', name)
                loader.add_value('url', urljoin_rfc(get_base_url(response), url))
                loader.add_value('price', price)
                yield loader.load_item()
        # NOTE: `next` shadows the builtin; harmless in this local scope.
        next = hxs.select('//a[@class="ectlink" and @ rel="next"]/@href').extract()
        if next:
            url = urljoin_rfc(get_base_url(response), next[0])
            yield Request(url, callback=self.parse_products)
def parse_product_list(self, response):
    """Extract the category ``path`` from inline JS and request the full
    product list via the site's JSON API; also follow product links."""
    hxs = HtmlXPathSelector(response)
    path = ''
    pattern = r"'([A-Za-z0-9_\./\\-]*)'"
    for line in hxs.extract().split('\n'):
        # Collapse whitespace so "path = " matches regardless of spacing.
        if 'path = ' in ' '.join(line.split()):
            text = ' '.join(line.split())
            # NOTE(review): raises AttributeError if the quoted value is
            # missing from the matched line.
            path = re.search(pattern, text).group().replace("'", '')
    if path:
        category_id = hxs.select('//select[@name="sort_by"]/@id').extract()[0].replace('sort_by_','')
        # API endpoint returning up to 1000 products for the category.
        product_list_url = ('http://www.mattressnextday.co.uk/index.php?route=api/' +
                            'category/getProducts&sort_by=price_low_to_high&'+
                            'category_id='+category_id+'&price_range=all&'+
                            'layout=grid&path='+path+'&per_page=1000&page=1')
        yield Request(product_list_url, callback=self.parse_product_list, meta=response.meta)
    for url in hxs.select('//div[@class="name"]/a/@href').extract():
        url = urljoin_rfc(get_base_url(response), url)
        yield Request(url, callback=self.parse_product, meta=response.meta)
def parse(self, response):
    """Parse a Burton product page, writing the item to the XML feed.

    Redirected responses mean the product is gone and are recorded as
    NOT_AVAILABLE.  Per-URL status is tracked in ``self.products``.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = BurtonItem()
    page = hxs.extract()
    # Redirected requests keep their original URL in redirect_urls[0].
    if 'redirect_urls' in response.request.meta:
        cur_url = response.request.meta['redirect_urls'][0]
    else:
        cur_url = response.url
    index = self.products['urls'].index(cur_url)
    try:
        if 'redirect_urls' in response.request.meta:
            # Redirect => product page gone: mark unavailable.
            item['product_id'] = [self.products['product_ids'][index]]
            item['name'] = [self.products['names'][index]]
            item['in_stock'] = ["NOT_AVAILABLE"]
            self.exc.code_handler(102, response.url)
            self.xml.create_xml(item)
            self.products["status"][index] = "no_avail"
        else:
            item['product_id'], item['name'] = self.get_basic_info(hxs)
            item['description'], item['features'] = self.get_description(
                hxs)
            item['variants'], thumb_urls, color_names = self.get_variants(
                page)
            item['all_sizes'] = self.get_all_sizes(page)
            item['color_json'], image_urls = self.get_colors(
                page, color_names)
            item['price'], item['old_price'] = self.get_prices(hxs)
            item['in_stock'] = ['IN_STOCK']
            item['product_link'] = [basic.cdata(response.url)]
            self.xml.create_xml(item)
            item['image_urls'] = image_urls + thumb_urls
            self.products["status"][index] = "ran"
    except Exception:
        # FIX: was a bare `except:` which also swallowed KeyboardInterrupt
        # and SystemExit.  Any scraping error marks the product as errored.
        self.exc.code_handler(100, response.url)
        self.products["status"][index] = "error"
    else:
        return item
def parse_products(self, response):
    """Yield products from the listing table, or recurse into category and
    sub-category links when no products are present.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select('//td[@class="td"]/div[@style="width:750px;'
                          ' padding: 10px 0px 10px 20px; "]/'
                          'table [@width="80%" and @cellpadding="4" and'
                          ' @border="0" and @align="center"]')
    if products:
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            # Two alternative layouts carry the product name.
            loader.add_xpath('name', 'tr/td/table/tr/td/strong/a/text()')
            loader.add_xpath('name', 'tr/td/div/strong/a/text()')
            url = product.select('tr/td/table/tr/td/strong/a/@href').extract()
            if url:
                url = urljoin_rfc(get_base_url(response), url[0])
            else:
                # Fallback layout for the product link.
                url = product.select('tr/td/div/strong/a/@href').extract()
                if url:
                    url = urljoin_rfc(get_base_url(response), url[0])
            loader.add_value('url', url)
            loader.add_xpath('price', 'tr/td/div[@class="HeadingText"]/text()')
            yield loader.load_item()
    else:
        try:
            categories = hxs.select('//td[@class="td"]/div[@style="width:750px;'
                                    ' padding: 10px 0px 10px 20px; "]/'
                                    'table[@cellpadding="5"]')
            if categories:
                for category in categories:
                    url = urljoin_rfc(get_base_url(response),
                                      category.select('tr/td/a[@class="HeadingText"]/@href').extract()[0])
                    yield Request(url, dont_filter=True, callback=self.parse_products)
        except IndexError:
            # Category row without a link: skip silently.
            pass
        # Sub-category links lack a container; patch the markup to add one.
        html = hxs.extract().replace('Sub Categories', '<div id="sub_categories">').replace('<p> </p>', '</div>')
        new_hxs = HtmlXPathSelector(text=html)
        sub_categories = new_hxs.select('//*[@id="sub_categories"]/a/@href').extract()
        for sub_category in sub_categories:
            url = urljoin_rfc(get_base_url(response), sub_category)
            yield Request(url, dont_filter=True, callback=self.parse_products)
def parse_course_list(self, response, courselist):
    """Split the page HTML into per-course fragments and build course items.

    Course headers are <tr> rows containing a blue font tag; the raw HTML
    between consecutive headers is that course's detail fragment.
    """
    retval = []
    hxs = HtmlXPathSelector(response)
    courses = hxs.select('.//tr[descendant::font[@color="#0000FF"]]')
    length = len(courses)
    if length == 0:
        # no course to process
        return []
    # Everything after the first course header row.
    data = hxs.extract().split(courses[0].extract())[1]
    if length == 1:
        course_details = [data]
    else:
        course_details = []
        for course in courses[1:]:
            # Peel off the fragment belonging to the previous course.
            s = data.split(course.extract())
            course_details.append(s[0])
            data = s[1]
        course_details.append(s[1])  # tail fragment = last course's detail
    # sanity check
    assert(length == len(course_details))
    flags = re.UNICODE | re.MULTILINE #| re.DOTALL
    for course, course_detail in zip(courses, course_details):
        code_title_au_dept = course.select('.//font/text()').extract()
        # Colour-coded annotations — meanings presumed from variable names
        # (pass-fail / mutually-exclusive / unavailable / prereq); confirm.
        passfail = filter(None, re.findall(u'<font.*color="RED">([^<]*)',
                                           course_detail, flags))
        mutex = filter(None, re.findall(u'<font.*color="BROWN">([^<]*)',
                                        course_detail, flags))
        unavail = filter(None, re.findall(u'<font.*color="GREEN">([^<]*)',
                                          course_detail, flags))
        prereq = filter(None, re.findall(u'<font.*color="#FF00FF">([^<]*)',
                                         course_detail, flags))
        desc = re.search(u'<font size="2">([^<]*)', course_detail).groups()[0]
        courseitem = self._fill_in(courselist, code_title_au_dept, passfail,
                                   mutex, unavail, prereq, desc)
        if courseitem:
            retval.append(courseitem)
    return retval
def parse(self, response):
    """Parse a Burton product page (redirects are recorded as unavailable)
    and write the item to the XML feed; status is tracked per URL.
    """
    self.counter += 1
    basic.print_status(self.counter, self.total)
    hxs = HtmlXPathSelector(response)
    item = BurtonItem()
    page = hxs.extract()
    # A redirected request keeps its original URL in redirect_urls[0].
    if 'redirect_urls' in response.request.meta:
        cur_url = response.request.meta['redirect_urls'][0]
    else:
        cur_url = response.url
    index = self.products['urls'].index(cur_url)
    try:
        if 'redirect_urls' in response.request.meta:
            # Redirect => product gone: record as NOT_AVAILABLE.
            item['product_id'] = [self.products['product_ids'][index]]
            item['name'] = [self.products['names'][index]]
            item['in_stock'] = ["NOT_AVAILABLE"]
            self.exc.code_handler(102, response.url)
            self.xml.create_xml(item)
            self.products["status"][index] = "no_avail"
        else:
            item['product_id'], item['name'] = self.get_basic_info(hxs)
            item['description'], item['features'] = self.get_description(hxs)
            item['variants'], thumb_urls, color_names = self.get_variants(page)
            item['all_sizes'] = self.get_all_sizes(page)
            item['color_json'], image_urls = self.get_colors(page, color_names)
            item['price'], item['old_price'] = self.get_prices(hxs)
            item['in_stock'] = ['IN_STOCK']
            item['product_link'] = [basic.cdata(response.url)]
            self.xml.create_xml(item)
            item['image_urls'] = image_urls + thumb_urls
            self.products["status"][index] = "ran"
    except Exception:
        # FIX: bare `except:` replaced — it also swallowed KeyboardInterrupt
        # and SystemExit.  Errors mark this product's status as "error".
        self.exc.code_handler(100, response.url)
        self.products["status"][index] = "error"
    else:
        return item
def parse_product(self, response):
    """Parse a product page into one Product per size (or a single Product).

    Pages whose price cell says 'Cart' carry multiple add-to-cart products
    embedded in JS and are re-requested for parse_add_products.  Otherwise
    the identifier combines b_product_id with an optional ?v= option id,
    and per-size data is parsed from inline `products[...] = new Array(...)`
    JS lines.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    cat_name = response.meta["cat_name"]
    # Fill up the Product model fields
    #identifier =
    url = response.url
    name = hxs.select("//div[contains(@class, 'details')]/h1/text()"
                      ).extract()[0].replace("-", " ").strip()
    price = hxs.select(
        "//div[@class='price-info']/strong[@id='price']/text()").extract(
        )[0].strip()
    #sku =
    #metadata =
    category = cat_name
    # Prefer the zoom link's target, fall back to the inline image, else "".
    image_url = hxs.select("//div[@class='main-image']/a/@href").extract()
    if not image_url:
        image_url = hxs.select(
            "//div[@class='main-image']/img/@src").extract()
    if not image_url:
        image_url = ""
    brand = hxs.select(
        "//div[contains(@class, 'details')]/h1/strong/text()").extract()
    shipping_cost = hxs.select(
        "//dl[@class='blue']/dd/text()").extract()[0].strip()
    self.log(price)
    if 'Cart' in price:
        # Multi-product page: collect quoted fields from each
        # "Add to Cart" JS line (fields 1 and 2 are skipped).
        products = []
        for line in hxs.extract().split('\n'):
            if "Add to Cart" in line:
                product = re.findall('"([A-Za-z0-9 _\./\\-]*)"', line)
                if product:
                    products.append(product[:1] + product[3:])
        if products:
            self.log("products: " + str(products))
            yield Request(url, dont_filter=True,
                          callback=self.parse_add_products,
                          meta={
                              'products': products,
                              'url': url,
                              'brand': brand,
                              'category': category
                          })
    else:
        b_product_id = hxs.select(
            '//input[@id="b_product_id"]/@value').extract()
        # Option variant id comes from the ?v= query parameter, if present.
        o = urlparse(url)
        params = parse_qs(o.query)
        cur_option_id = ""
        if "v" in params:
            self.log("option v found")
            cur_option_name = params["v"]
            if cur_option_name:
                cur_option_id = cur_option_name[0].strip().lower()
        product_id = hxs.select(
            '//input[@id="product_id"]/@value').extract()
        if not b_product_id:
            self.log("ERROR b_product id not found")
        else:
            res_product_id = (b_product_id[0] + " " + cur_option_id).strip()
            #l.add_value('identifier', res_product_id)
            size_option = hxs.select(
                '//fieldset/div/div[label/text()="\r\n                            \r\n                            Size:\r\n                        "]/select'
            )
            if size_option:
                # Per-size rows come from JS: products[..] = new Array(...);
                # each Array literal is parsed with ast.literal_eval.
                sizes = []
                for line in response.body.split('\n'):
                    if 'products[' in line and 'new Array' in line:
                        sizes.append(
                            ast.literal_eval(
                                line.split('Array')[-1].split(';')[0]))
                for size in sizes:
                    # size fields by position: [0]=size id, [1]=price,
                    # [3]/[4]=size labels — presumed from usage; confirm.
                    l = ProductLoader(response=response, item=Product())
                    #l.add_value('identifier', identifier)
                    l.add_value('url', url)
                    l.add_value('name', name + ' ' + size[3] + ' ' + size[4])
                    l.add_value('identifier', res_product_id + '-' + size[0])
                    l.add_value('price', size[1])
                    l.add_value('category', category)
                    l.add_value('image_url', image_url)
                    l.add_value('brand', brand)
                    l.add_value('shipping_cost', shipping_cost)
                    # Zero/empty price is treated as out of stock.
                    if size[1]:
                        l.add_value('stock', '1')
                    else:
                        l.add_value('stock', '0')
                    yield l.load_item()
            else:
                # Single-variant product.
                l = ProductLoader(response=response, item=Product())
                #l.add_value('identifier', identifier)
                l.add_value('url', url)
                l.add_value('identifier', res_product_id)
                l.add_value('name', name)
                l.add_value('price', price)
                #l.add_value('sku', sku)
                #l.add_value('metadata', metadata)
                l.add_value('category', category)
                l.add_value('image_url', image_url)
                l.add_value('brand', brand)
                l.add_value('shipping_cost', shipping_cost)
                if price:
                    l.add_value('stock', '1')
                else:
                    l.add_value('stock', '0')
                yield l.load_item()
def parse_products(self, response):
    """Parse a category page.

    When product boxes are present, yield one product-page request per
    box; otherwise crawl category and sub-category links back into this
    callback.

    Fixes over the previous revision:
    - products with no extractable URL are skipped instead of building a
      Request from an empty list;
    - the first ``sub_categories`` selection (CategoryContainer links)
      was dead code -- its result was immediately overwritten -- and has
      been removed;
    - the trailing ProductDetails link loop duplicated the earlier
      identical request loop (same URLs, same callback, no dont_filter,
      so the dupe filter dropped them) and has been removed, along with
      a large block of commented-out code.
    """
    hxs = HtmlXPathSelector(response)
    products = hxs.select(
        '//div[@id="ProductDetails"]/div[@id="ProductDetails"]')
    if products:
        for product in products:
            url = product.select(
                'div/div[@id="ProductName"]/h2/a/@href').extract()
            if not url:
                # Older layout keeps the product link inside a table row.
                url = product.select('tr/td/div/strong/a/@href').extract()
            if not url:
                # No link found in either layout; skip this box.
                continue
            url = urljoin_rfc(get_base_url(response), url[0])
            yield Request(url, callback=self.parse_product)
    else:
        # Top-level category boxes.
        try:
            categories = hxs.select(
                '//td[@class="td"]/div[@style="width:750px;'
                ' padding: 10px 0px 10px 20px; "]/'
                'table[@cellpadding="5"]')
            for category in categories:
                url = urljoin_rfc(
                    get_base_url(response),
                    category.select(
                        'tr/td/a[@class="HeadingText"]/@href').extract(
                        )[0])
                yield Request(url,
                              dont_filter=True,
                              callback=self.parse_products)
        except IndexError:
            # A category box without a HeadingText link; ignore it.
            pass
        # Sub-category links living directly under ProductDetails.
        sub_categories = hxs.select(
            '//div[@id="ProductDetails"]/a/@href').extract()
        for sub_category in sub_categories:
            url = urljoin_rfc(get_base_url(response), sub_category)
            yield Request(url, callback=self.parse_products)
        # The "Sub Categories" section has no usable container markup, so
        # wrap it in a synthetic <div> and re-parse to isolate its links.
        html = hxs.extract().replace('Sub Categories',
                                     '<div id="sub_categories">').replace(
                                         '<p> </p>', '</div>')
        new_hxs = HtmlXPathSelector(text=html)
        sub_categories = new_hxs.select(
            '//*[@id="sub_categories"]/a/@href').extract()
        for sub_category in sub_categories:
            url = urljoin_rfc(get_base_url(response), sub_category)
            yield Request(url,
                          dont_filter=True,
                          callback=self.parse_products)
def search(city, date):
    """Scrape Google Movies showtimes for *city* on *date*.

    Pages through /movies?near=<city>&start=<n> ten results at a time
    until the page reports "No showtimes were found", collecting
    (theater, movie, times) tuples, and returns them JSON-encoded.

    NOTE(review): the whole body is wrapped in a bare ``except`` that
    returns None on ANY failure -- including the IndexError the inner
    loop can raise via ``m[counter]`` once ``counter`` reaches
    ``len(m)`` -- so errors are silently swallowed.
    """
    try:
        movielist = []
        start = 0
        conn = httplib.HTTPConnection("www.google.com")
        conn.request("GET", "/movies?near="+city+"&start="+str(start)+"&date="+str(date))
        r1 = conn.getresponse()
        # Wrap the raw httplib response in an HtmlResponse so the scrapy
        # selector can run XPath over it.
        hxs = HtmlXPathSelector(HtmlResponse("www.google.com/movies?near="+city+"&date="+str(date)+"&start="+str(start), r1.status, r1.getheaders(), r1.read(), request=conn))
        st = hxs.extract()
        while("No showtimes were found" not in st):
            # Theater names, movie names and showtime strings, in page order.
            theater = hxs.select('//h2[contains(@class,"name")]/a[contains(@href,"movies")]/text()').extract()
            m = hxs.select('//div[contains(@class,"name")]/a[contains(@href,"movies")]/text()').extract()
            x = hxs.select('//div[contains(@class,"times")]/text()').extract()
            counter = 0
            # Walk consecutive theater pairs: the raw-HTML offsets of the
            # two names bracket the movies belonging to the first one.
            for i, j in zip(theater, theater[1:]):
                star = st.find(i)
                end = st.find(j)
                test = st.find(m[counter], star, end)
                while(test != -1):
                    temp = x[counter].split()
                    # Strip the "am" suffix from the first morning time;
                    # default the am index to 0 when there is none.
                    try:
                        am = next(temp1 for temp1, temp2 in enumerate(temp) if temp2.endswith("am"))
                        temp[am] = temp[am][:-2]
                    except:
                        am = 0
                    # Strip the "pm" suffix, then convert the pm time and
                    # every time between the am and pm markers to 24-hour.
                    try:
                        pm = next(temp1 for temp1, temp2 in enumerate(temp) if temp2.endswith("pm"))
                        temp[pm] = temp[pm][:-2]
                        if int(temp[pm].split(':')[0]) < 12:
                            temp[pm] = str((int(temp[pm].split(':')[0])+12)%24)+temp[pm][temp[pm].find(':'):]
                        for temp3 in range(am+1, pm):
                            if int(temp[temp3].split(':')[0]) < 12:
                                temp[temp3] = str((int(temp[temp3].split(':')[0])+12)%24)+temp[temp3][temp[temp3].find(':'):]
                    except:
                        pass
                    movielist.append((i, m[counter], temp))
                    # Advance past this movie name and look for the next
                    # one belonging to the same theater.
                    star = test+len(m[counter])
                    counter += 1
                    test = st.find(m[counter], star, end)
            # Any movies left over belong to the last theater on the page
            # (it has no following theater to bracket it).
            while(counter != len(m)):
                temp = x[counter].split()
                try:
                    am = next(temp1 for temp1, temp2 in enumerate(temp) if temp2.endswith("am"))
                    temp[am] = temp[am][:-2]
                except:
                    am = 0
                try:
                    pm = next(temp1 for temp1, temp2 in enumerate(temp) if temp2.endswith("pm"))
                    temp[pm] = temp[pm][:-2]
                    for temp3 in range(am+1, pm+1):
                        if int(temp[temp3].split(':')[0]) < 12:
                            temp[temp3] = str((int(temp[temp3].split(':')[0])+12)%24)+temp[temp3][temp[temp3].find(':'):]
                except:
                    pass
                movielist.append((theater[-1], m[counter], temp))
                counter += 1
            #pdb.set_trace()
            # Fetch the next page of ten results.
            conn.close()
            start += 10
            conn = httplib.HTTPConnection("www.google.com")
            conn.request("GET", "/movies?near="+city+"&start="+str(start)+"&date="+str(date))
            r1 = conn.getresponse()
            hxs = HtmlXPathSelector(HtmlResponse("www.google.com/movies?near="+city+"&date="+str(date)+"&start="+str(start), r1.status, r1.getheaders(), r1.read(), request=conn))
            st = hxs.extract()
        #return json.JSONEncoder().encode(movielist)
        return json.dumps(movielist, sort_keys=True, indent=4)
    except:
        pass
def parse_product(self, response):
    """Parse a product page.

    'Essentials' products with an options dropdown yield one item per
    option (label appended to the name, option price added as a delta to
    the base price); everything else yields a single item. Each item
    carries a SimplyPleasureMeta whose cost_price is looked up by
    identifier in ``self.cost_prices``.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    loader = ProductLoader(item=Product(), selector=hxs)
    name = hxs.select(
        '//div[@class="product-name"]/h1/text()').extract()[0]
    url = response.url
    # Price: main price box first (pound-sign prefixed), falling back to
    # any span.price on the page.
    price = hxs.select(
        '//div[@class="product-shop"]//div[@class="wrapper-price-share"]'
        '//div[@class="price-box"]//span[contains(@id, "product-price")]'
        '/text()').re('\xa3(.*)')
    if price:
        price = price[0]
    else:
        price = hxs.select('//span[@class="price"]/text()').re(
            '\xa3(.*)')[0]
    sku = hxs.select('//meta[@itemprop="productID"]/@content').re(
        'sku:(.*)')
    category = hxs.select(
        '//div[@class="breadcrumbs"]//a/text()').extract()[-1]
    # Brand is recovered from the brand-logo image filename, e.g.
    # "some_brand-logo.png" -> "Some Brand".
    brand = hxs.select('//div[@class="product-brand"]/img/@src').extract()
    if brand:
        brand = brand[0].split('/')[-1].lower()
        brand = brand.replace('_', ' ').replace('-logo', '#').replace(
            ' logo', '#').split('#')[0].title()
    else:
        brand = ''
    loader.add_value('brand', brand)
    loader.add_value('url', url)
    loader.add_value('name', name)
    loader.add_value('price', price)
    loader.add_value('sku', sku)
    loader.add_value('identifier', sku)
    loader.add_value('category', category)
    # Absence of the in-stock icon marks the product out of stock.
    stock = hxs.select('//div[@class="pdp-info-icon in-stock"]').extract()
    if not stock:
        loader.add_value('stock', 0)
    image_url = hxs.select(
        '//meta[@property="og:image"]/@content').extract()
    if image_url:
        loader.add_value('image_url', image_url[0])
    options = hxs.select('//div[@class="input-box"]/select')
    # Second breadcrumb entry is treated as the main category.
    main_category = hxs.select(
        '//div[@class="breadcrumbs"]//a/text()').extract()
    main_category = main_category[1] if len(main_category) > 1 else ''
    if options and main_category == 'Essentials':
        log.msg('CRAWL PRODUCT OPTIONS')
        # Option data lives in the inline Product.Config({...}); JS call.
        options = json.loads(hxs.extract().partition('Product.Config(')
                             [-1].partition(');')[0])
        #options_number_key = '157'
        #if options_number_key not in options['attributes']:
        #options_number_key = '187'
        # The attribute key is read from the select element's id
        # ("attribute<NNN>") instead of being hard-coded.
        options_number_key = response.xpath('//select/@id').re(
            'attribute(\d+)')[0]
        options = options['attributes'][options_number_key]['options']
        for option in options:
            product = loader.load_item()
            product['identifier'] = product['identifier'] + '-' + option[
                'label']
            product['name'] = product['name'] + ' ' + option['label']
            # Option price is a surcharge added to the base price.
            product['price'] = float(product['price']) + float(
                option['price'])
            option_loader = ProductLoader(item=product, response=response)
            item = option_loader.load_item()
            metadata = SimplyPleasureMeta()
            metadata['cost_price'] = self.cost_prices.get(
                item['identifier'])
            item['metadata'] = metadata
            yield item
    else:
        item = loader.load_item()
        metadata = SimplyPleasureMeta()
        metadata['cost_price'] = self.cost_prices.get(item['identifier'])
        item['metadata'] = metadata
        yield item
def _query(xpath, response, extract=True):
    """Run *xpath* against *response*.

    Returns the extracted string list by default, or the raw selector
    result when ``extract`` is False.
    """
    selected = HtmlXPathSelector(response).select(xpath)
    if extract:
        return selected.extract()
    return selected
def parse(self, response):
    """Parse a tyre listing page whose data is embedded in an inline
    "JsonObject = {...};" JS assignment.

    Builds 'Fitted' products (the 'Delivered' variant is deliberately not
    yielded) and emits only the cheapest product per descriptive key
    (brand/name/fitting/size/xl/run-flat/mark).
    """
    try:
        hxs = HtmlXPathSelector(response)
    except AttributeError:
        # Non-HTML or broken response: record the failing CSV row.
        msg = 'Error getting selector on page for row: %s' % response.meta[
            'row']
        self.log('[ERROR] %s' % msg)
        self.errors.append(msg)
        return
    # CSV row with the tyre spec this request was generated from.
    row = response.meta['row']
    # Locate and parse the inline "JsonObject = {...}; " line.
    json_data = None
    for line in hxs.extract().split('\n'):
        if "JsonObject = " in line:
            json_data = json.loads(
                line.replace('JsonObject = ', '').replace('; \r', ''))
    # NOTE(review): if no such line exists, json_data stays None and the
    # next statement raises TypeError.
    products = json_data['Rest'] + json_data['Deals']
    collected_products = []
    for product_info in products:
        # skip winter tyres
        if product_info['WinterTyre']:
            continue
        loader = ProductLoader(item=Product(), selector=product_info)
        loader.add_value('name', product_info['ModelName'])
        brand = product_info['Manufacturer']
        loader.add_value('brand', unify_brand(brand))
        loader.add_value(
            'category', find_brand_segment(loader.get_output_value('brand')))
        identifier = product_info['PrimaryId']
        fitting_method = 'Delivered'
        url = '/catalogue' + product_info[
            'CatalogueUrl'] + '/f?tyre=' + str(product_info['PrimaryId'])
        loader.add_value('url', urljoin(get_base_url(response), url))
        # Image: prefer the large image; the field may hold a whole <img>
        # tag, so pull the src attribute out of it.
        image_url = product_info.get('ModelImageLarge')
        if not image_url:
            image_url = product_info.get('ModelImage')
        if image_url:
            image_url = image_url.split('src="')[-1].split('"')[0]
            loader.add_value('image_url',
                            urljoin(get_base_url(response), image_url))
        loader.add_value('identifier',
                         str(identifier) + '-' + fitting_method)
        price = product_info['SellingPrice']
        loader.add_value('price', price)
        spec = product_info['SpecificationName']
        metadata = MicheldeverMeta()
        # metadata['mts_stock_code'] = row['MTS Stockcode']
        metadata['aspect_ratio'] = row['Aspect Ratio']
        metadata['rim'] = row['Rim']
        # Speed rating is the last token of the specification string.
        metadata['speed_rating'] = spec.split()[-1]
        metadata['width'] = row['Width']
        metadata['fitting_method'] = fitting_method
        load_rating = product_info['LoadRatingName']
        metadata['load_rating'] = load_rating
        metadata['alternative_speed_rating'] = ''
        xl = product_info['Reinforced']
        metadata['xl'] = 'Yes' if xl else 'No'
        run_flat = product_info['RunFlat']
        metadata['run_flat'] = 'Yes' if run_flat else 'No'
        # Manufacturer mark is the first token of the variant string,
        # mapped through find_man_mark.
        manufacturer_mark = product_info['Variant']
        if manufacturer_mark:
            manufacturer_mark = manufacturer_mark.split()[0].strip()
        metadata['manufacturer_mark'] = find_man_mark(
            manufacturer_mark) if manufacturer_mark else ''
        metadata['full_tyre_size'] = '/'.join(
            (row['Width'], row['Aspect Ratio'], row['Rim'],
             metadata['load_rating'], metadata['speed_rating']))
        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            continue
        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        # Normalize speed ratings via the shared helpers; the alternative
        # rating falls back to the old rating when it changed, else ''.
        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating
        # Do not collect "Delivered" tyres
        # yield product
        # Re-label the same product as its 'Fitted' variant and keep it.
        product['price'] = product_info['FullyFittedPrice']
        fitting_method = 'Fitted'
        product['identifier'] = str(identifier) + '-' + fitting_method
        product['metadata']['fitting_method'] = fitting_method
        collected_products.append(product)
    # Deduplicate: keep only the cheapest product for each key.
    min_price_products = {}
    for product in collected_products:
        key = "%s-%s-%s-%s-%s-%s-%s" % (
            product['brand'], product['name'],
            product['metadata']['fitting_method'],
            product['metadata']['full_tyre_size'],
            product['metadata']['xl'], product['metadata']['run_flat'],
            product['metadata']['manufacturer_mark'])
        if key in min_price_products:
            if product['price'] < min_price_products[key]['price']:
                min_price_products[key] = product
        else:
            min_price_products[key] = product
    for product in min_price_products.values():
        yield product
def parse(self, response): hxs = HtmlXPathSelector(response) # titles = hxs.select("//ul[@class='title2 fs_14']") # testElem = hxs.select("/html/body/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[3]/div[1]/div[1]/ul[1]") # print response.url webStr = hxs.extract() # print webStr newsLinks = getLinks_before20110406(webStr) items = [] for newsLink in newsLinks: # print li.extract() title = newsLink[0] link = newsLink[1] # newsBody = getNewsBody(link) newsBody = getNewsBody_20100506_20101231(link) if len(newsBody) < 100: print '------------------------' print title print link print newsBody print 'problem!!!!!!!!!!!!!!!!!!!!' item = CctvScraperItem() # item["date"] = '20130716' # item["date"] = response.url[-14:-6] item["date"] = getDTFromUrl(response.url) item ["title"] = title.encode('utf8') item ["link"] = link.encode('utf8') item["newsBody"] = newsBody items.append(item) self.csvWriter.writerow([item['date'], item['title'], item["newsBody"]]) # print title # print link # print newsBody # def parse(self, response): # hxs = HtmlXPathSelector(response) # items = [] # sections1 = hxs.select("/html") # for li in sections1.select('//li'): # title = ''.join(li.select('a/text()').extract()) # if u"[视频]" not in title: # continue # link = li.select('a/@href').extract()[0] # newsBody = getNewsBody(link) # item = CctvScraperItem() # # item["date"] = '20130716' # item["date"] = response.url[-14:-6] # item ["title"] = title.encode('utf8') # item ["link"] = link.encode('utf8') # item["newsBody"] = newsBody # items.append(item) # self.csvWriter.writerow([item['date'], item['title'], item["newsBody"]]) # print title # print link # sections1 = hxs.select("/html/body/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[3]/div[1]/div[1]/ul[1]") # sections2 = hxs.select("/html/body/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[3]/div[1]/div[1]/ul[2]") # sections3 = 
hxs.select("/html/body/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/div[1]/div[1]/div[3]/div[1]/div[1]/ul[3]") # # items = [] # for sections in [sections1, sections2, sections3]: # for li in sections.select('.//li'): # # print li.extract() # title = ''.join(li.select('a/text()').extract()) # link = li.select('a/@href').extract()[0] # item = CctvScraperItem() # item ["title"] = title # item ["link"] = link # items.append(item) # print title # print link # return items
def parse_product(self, response):
    """Parse a Magento product page.

    Products whose breadcrumb contains "ESSENTIAL" and that expose a
    JSON '"options":[...]' block yield one item per option (label
    appended to the name, option price added to the base price);
    everything else yields a single item keyed by the hidden product id.
    Any missing required field (IndexError) silently aborts the page.
    """
    if not isinstance(response, HtmlResponse):
        return
    hxs = HtmlXPathSelector(response)
    try:
        name = hxs.select(
            '//div[@class="product-name"]/h1/text()').extract()[0]
        url = response.url
        # Regular price first, then the special-price block.
        price = hxs.select(
            '//div[@class="product-view nested-container"]'
            '//div[@class="price-box"]/span/span[@class="price"]/text()'
        ).extract()
        if not price:
            price = hxs.select(
                '//div[@class="product-view nested-container"]'
                '//div[@class="price-box"]/p[@class="special-price"]'
                '/span[@class="price"]/text()').extract()
        sku = hxs.select('//tr[th/text()="SKU"]/td/text()').extract()[0]
        brand = hxs.select(
            '//tr[th/text()="Manufacturer"]/td/text()').extract()[0]
        if price:
            price = extract_price(price[0])
        else:
            price = 0
        image_url = hxs.select('//a[@id="zoom1"]/img/@src').extract()
        if image_url:
            image_url = image_url[0]
        else:
            image_url = ''
        # Last breadcrumb entry is used as the category.
        breadcrumb = hxs.select(
            '//div[@class="grid-full breadcrumbs"]/ul/li/a/text()'
        ).extract()
        category = breadcrumb[-1]
        if "ESSENTIAL" in ''.join(breadcrumb).upper():
            # Option data is embedded in inline JS as '"options":[...]'.
            opts = []
            for line in hxs.extract().split('\n'):
                if '"options":[' in line:
                    opts = json.loads(
                        line.split('"options":')[-1].split('}}')[0])
            if opts:
                for opt in opts:
                    log.msg('CRAWL PRODUCT OPTIONS')
                    option_name = name + " - " + opt.get('label')
                    # Option price is a surcharge on the base price.
                    option_price = price + extract_price(opt.get('price'))
                    loader = ProductLoader(item=Product(), selector=hxs)
                    loader.add_value('url', url)
                    loader.add_value('name', option_name)
                    loader.add_value('price', option_price)
                    loader.add_value('sku', sku)
                    loader.add_value('brand', brand)
                    loader.add_value('image_url', image_url)
                    loader.add_value('identifier',
                                     sku + '-' + opt.get('label'))
                    loader.add_value('category', category)
                    # Missing availability marker means out of stock.
                    stock = hxs.select(
                        '//p[@class="availability in-stock"]').extract()
                    if not stock:
                        loader.add_value('stock', 0)
                    yield loader.load_item()
            else:
                # ESSENTIAL product without an options block: single item.
                loader = ProductLoader(item=Product(), selector=hxs)
                loader.add_value('url', url)
                loader.add_value('name', name)
                loader.add_value('price', price)
                loader.add_value('sku', sku)
                loader.add_value('brand', brand)
                loader.add_value('image_url', image_url)
                identifier = hxs.select(
                    '//input[@name="product"]/@value').extract()[0]
                loader.add_value('identifier', identifier)
                loader.add_value('category', category)
                stock = hxs.select(
                    '//p[@class="availability in-stock"]').extract()
                if not stock:
                    loader.add_value('stock', 0)
                yield loader.load_item()
        else:
            # Non-ESSENTIAL product: single item.
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('url', url)
            loader.add_value('name', name)
            loader.add_value('price', price)
            loader.add_value('sku', sku)
            loader.add_value('brand', brand)
            loader.add_value('image_url', image_url)
            identifier = hxs.select(
                '//input[@name="product"]/@value').extract()[0]
            loader.add_value('identifier', identifier)
            loader.add_value('category', category)
            stock = hxs.select(
                '//p[@class="availability in-stock"]').extract()
            if not stock:
                loader.add_value('stock', 0)
            yield loader.load_item()
    except IndexError:
        # Pages missing any required field are skipped wholesale.
        return
def parse_product(self, response):
    """Parse a product page, preferring field values passed through
    ``response.meta`` (set by the listing page) and falling back to
    on-page selectors. Wraps the loaded item in a reviews-page request.
    """
    base_url = get_base_url(response)
    hxs = HtmlXPathSelector(response)
    meta = response.meta
    url = response.url
    # Price hides in an inline JS line tagged MAIN:No^Refrnce; fall back
    # to the itemprop="price" span (commas stripped).
    price = ''
    for line in hxs.extract().split('\n'):
        if "MAIN:No^Refrnce" in line:
            price = line.split('");')[0].split(', "')[-1]
    if not price:
        try:
            price = hxs.select(
                '//span[@itemprop="price"]/text()').extract()[0].replace(
                    ',', '')
        except:
            pass
    identifier = meta.get('identifier')
    if not identifier:
        identifier = hxs.select(
            '//form[@name="addItemToCart"]//input[@name="sku"]/@value'
        ).extract()[0]
    image_url = meta.get('image_url')
    if not image_url:
        image_url = hxs.select('//img[@id="mainImage"]/@src').extract()
    brand = meta.get('brand')
    if not brand:
        brand = hxs.select(
            '//div[@id="tMain"]//div[@class="mfrLogo"]//img[1]/@alt'
        ).extract()
    category = meta.get('category')
    if not category:
        try:
            category = hxs.select('//ul[@id="breadcrumbs"]/li/a/text()'
                                  ).extract()[-1].strip()
        except:
            pass
    sku = meta.get('sku')
    if not sku:
        # SKU is taken from the productID meta tag's "mpn:..." content;
        # when found, the Bushnell products file may override the
        # category for that SKU.
        sku = hxs.select(
            '//meta[@itemprop="productID" and contains(@content, "mpn:")]/@content'
        ).re(r'mpn:(\w+)')
        if sku:
            bushnell_product = self.bushnell_products.get(
                sku[0].upper().strip(), None)
            if bushnell_product:
                category = bushnell_product['Class']
                log.msg(
                    'Extracts category "%s" from bushnell file, URL: %s' %
                    (category, response.url))
    name = meta.get('name')
    if not name:
        name = ''.join(
            hxs.select(
                '//h1[@itemprop="name"]//text()').extract()).strip()
    # Each product URL is emitted at most once per crawl run.
    if url not in self.urls_list:
        self.urls_list.append(url)
        loader = ProductLoader(item=Product(), selector=hxs)
        loader.add_value('identifier', identifier)
        loader.add_value('image_url', image_url)
        loader.add_value('brand', brand)
        loader.add_value('category', category)
        loader.add_value('url', url)
        loader.add_value('sku', sku)
        loader.add_value('name', name)
        loader.add_value('price', price)
        product = loader.load_item()
        yield self._get_reviews_url(product)
def parse(self, response):
    """Yield the page's raw HTML, then re-request the same URL.

    NOTE(review): yielding a bare string from a spider callback is not a
    valid scrapy output (Request/item expected), and re-requesting
    response.url with dont_filter=True into this same callback loops
    indefinitely -- this looks like a polling/debug stub; confirm intent.
    """
    hxs = HtmlXPathSelector(response)
    yield hxs.extract()
    yield Request(response.url, callback=self.parse, dont_filter=True)