Example #1
    def test_add_or_replace_parameter(self):
        url = 'http://domain/test'
        self.assertEqual(add_or_replace_parameter(url, 'arg', 'v'),
                         'http://domain/test?arg=v')
        url = 'http://domain/test?arg1=v1&arg2=v2&arg3=v3'
        self.assertEqual(add_or_replace_parameter(url, 'arg4', 'v4'),
                         'http://domain/test?arg1=v1&arg2=v2&arg3=v3&arg4=v4')
        self.assertEqual(add_or_replace_parameter(url, 'arg3', 'nv3'),
                         'http://domain/test?arg1=v1&arg2=v2&arg3=nv3')

        url = 'http://domain/test?arg1=v1;arg2=v2'
        self.assertEqual(add_or_replace_parameter(url, 'arg1', 'v3'),
                         'http://domain/test?arg1=v3&arg2=v2')

        self.assertEqual(add_or_replace_parameter("http://domain/moreInfo.asp?prodID=", 'prodID', '20'),
                         'http://domain/moreInfo.asp?prodID=20')
        url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2%2C60&CatID=60'
        self.assertEqual(add_or_replace_parameter(url, 'BCat', 'newvalue'),
                         'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
        url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2,60&CatID=60'
        self.assertEqual(add_or_replace_parameter(url, 'BCat', 'newvalue'),
                         'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
        url = 'http://rmc-offers.co.uk/productlist.asp?'
        self.assertEqual(add_or_replace_parameter(url, 'BCat', 'newvalue'),
                         'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue')

        url = "http://example.com/?version=1&pageurl=http%3A%2F%2Fwww.example.com%2Ftest%2F%23fragment%3Dy&param2=value2"
        self.assertEqual(add_or_replace_parameter(url, 'version', '2'),
                         'http://example.com/?version=2&pageurl=http%3A%2F%2Fwww.example.com%2Ftest%2F%23fragment%3Dy&param2=value2')
        self.assertEqual(add_or_replace_parameter(url, 'pageurl', 'test'),
                         'http://example.com/?version=1&pageurl=test&param2=value2')
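These tests appear to exercise add_or_replace_parameter from w3lib (w3lib.url); note that a few examples further down come from older forks whose version of the function accepts extra keyword arguments such as sep and url_is_quoted. A minimal standalone sketch of the core behavior, assuming a current w3lib install:

    from w3lib.url import add_or_replace_parameter

    # the parameter is appended when absent...
    print(add_or_replace_parameter('http://domain/test', 'arg', 'v'))
    # -> http://domain/test?arg=v

    # ...and replaced in place when already present
    print(add_or_replace_parameter('http://domain/test?arg=v', 'arg', 'nv'))
    # -> http://domain/test?arg=nv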
Example #2
 def start_requests(self):
     for start_url in self.start_urls:
         u = url.add_or_replace_parameter(start_url[1], 'p', 1) # page
         for price in self.prices:
             u = url.add_or_replace_parameter(u, 'pr', ",".join([str(p) for p in price])) 
            
             yield Request(u, callback=self.parse_pages_json)
Example #4
    def start_requests(self):
        today_in_tz = datetime.now(timezone(self.tz))
        # d = timedelta(days=2)
        # today_in_tz += d

        # href = u"https://www.arb.ca.gov/aqmis2/display.php?year={year}&mon={month}&day={day}&param={param}&order=basin,county_name,name&county_name=--COUNTY--&basin=--AIR+BASIN--&latitude=--PART+OF+STATE--&o3switch=new&ptype=aqd&report=HVAL&statistic=HVAL&btnsubmit=Update+Display&units=007&hours=all"
        href = u"https://www.arb.ca.gov/aqmis2/display.php?year=x&mon=x&day=x&param=&order=basin,county_name,name&county_name=--COUNTY--&basin=--AIR+BASIN--&latitude=--PART+OF+STATE--&o3switch=new&ptype=aqd&report=HVAL&statistic=HVAL&btnsubmit=Update+Display&units=007&hours=all"
        params = ["BENZENE", "BC", "CO", "CO2", "COH", "H2S", "LTSC", "CH4", "NO2", "NO", "NOX",
                  "NOY", "NMHC", "OZONE_ppm", "SO2", "THC", "PMTEOM", "PMBAM", "PM10_LHR",
                  "PM10_SHR", "PM25HR"
                  ]
        # params = ["BENZENE", "BC", "CO", "CO2", "COH"]
        # params = ["OZONE", "CO", "SO2"]
        # params = ["COH", "H2S",]
        # params = ["SO2",]

        url = add_or_replace_parameter(href, "year", today_in_tz.year)
        url = add_or_replace_parameter(url, "mon", today_in_tz.month)
        url = add_or_replace_parameter(url, "day", today_in_tz.day)

        param = params.pop(0)

        yield RandomRequest(
            # url=href.format(param=params.pop(0)),
            url=add_or_replace_parameter(url, "param", param),
            # callback=self.collect_station_data,
            callback=self.check_validity,
            meta={
                "params": params,
                "data": list(),
                "changed_today_in_tz": None,
                "href": url,
                "param": param
            }
        )
Example #5
 def get_api_url(api_url, api_key, target, **filters):
     '''Build the API URL to query a list of proxies'''
     api_url = urljoin(api_url, API_ENDPOINT)
     api_url = urljoin(api_url + '/', target)
     api_url = add_or_replace_parameter(api_url, 'api_key', api_key)
     for f_key, f_val in filters.items():
         api_url = add_or_replace_parameter(api_url, f_key, f_val)
     return api_url
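The api_url + '/' above matters: urljoin replaces the last path segment unless the base ends with a slash. A quick illustration with a made-up host and endpoint (Python 3's urllib.parse; a Python 2 source would import urljoin from urlparse):

    from urllib.parse import urljoin

    # without a trailing slash the final segment is swapped out...
    print(urljoin('http://host/api/proxies', 'mine'))   # -> http://host/api/mine
    # ...with it, the target is appended as a new segment
    print(urljoin('http://host/api/proxies/', 'mine'))  # -> http://host/api/proxies/mine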
Example #6
    def parse_product(self, response):
        base_sku = response.xpath('//@data-ref').extract_first()
        identifier = re.search('p(\d+)$',
                               url_query_cleaner(response.url)).group(1)
        url = 'https://www.andrewjamesworldwide.com/ajax/get_product_options/{0}'.format(
            identifier)
        data = json.load(urlopen(url))
        attributes = [attr['values'] for attr in data['attributes']]
        if [] in attributes:
            url = add_or_replace_parameter(url, 'attributes[1]',
                                           attributes[0][0]['value_id'])
            data = json.load(urlopen(url))
            attributes = [attr['values'] for attr in data['attributes']]
        variants = itertools.product(*attributes)
        for variant in variants:
            url = 'https://www.andrewjamesworldwide.com/ajax/get_product_options/{0}'.format(
                identifier)
            for idx, option in enumerate(variant):
                url = add_or_replace_parameter(
                    url, 'attributes[{0}]'.format(idx + 1), option['value_id'])
            data = json.load(urlopen(url))
            selection = data['selection'].values()[0]
            sku = selection['reference'].strip()
            if not sku and base_sku not in self.skus_found:
                sku = base_sku
            if sku not in self.skus.keys():
                continue
            if sku in self.skus_found:
                self.logger.info('Duplicated SKU is found: %s' % sku)
            self.skus_found.add(sku)

            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('sku', sku)
            loader.add_value('identifier', selection['product_id'])
            loader.add_xpath('name', '//span[@id="js-product-title"]/text()')
            loader.add_value('name', [option['value'] for option in variant])
            loader.replace_value('name', selection['title'])
            loader.add_value('url', response.url)
            loader.add_value('price', selection['price_inc'])
            category = response.css('div.breadcrumb a::attr(title)').extract()
            loader.add_value('category', category[1:])
            try:
                image_url = [
                    attr['images'][0]['image']
                    for attr in data['attributes'][-1]['values']
                ]
            except IndexError:
                image_url = response.xpath(
                    '//div[@id="js-product-image"]//@src').extract()
            loader.add_value('image_url', response.urljoin(image_url[0]))
            loader.add_value('brand', "Andrew James")
            item = loader.load_item()

            metadata = AndrewJamesMeta()
            metadata['asin'] = self.skus[sku]['ASIN']
            item['metadata'] = metadata
            yield item
Example #7
    def parse(self, response):
        # Main categories
        for cat_url in response.xpath(
                '//ul[@id="main-nav"]/li/a/@href').extract():
            yield Request(response.urljoin(cat_url))

        sub_categories = response.xpath(
            '//div[contains(@class, "sub-categories")]'
            '/div/div//p/a/@href').extract()
        for sub_cat in sub_categories:
            yield Request(
                add_or_replace_parameter(response.urljoin(sub_cat), 'sort',
                                         'lowest'))

        categories = response.xpath(
            '//ul[@class="category"]/li/a/@href').extract()
        categories += response.xpath(
            '//a[contains(@class, "shop-all-button")]/@href').extract()
        categories += response.css('.subcat-panel ::attr(href)').extract()
        for url in categories:
            yield Request(
                add_or_replace_parameter(response.urljoin(url), 'sort',
                                         'lowest'))

        next_page = response.xpath(
            '//ul[@class="pagination"]/li/a[@class="next"]/@href').extract()
        if next_page:
            yield Request(url=response.urljoin(next_page[0]))

        products = response.xpath('//div[contains(@class, "product")]')
        for product_xs in products:
            url = product_xs.xpath('a/@href').extract()
            if not url:
                continue
            product_loader = ProductLoader(item=Product(), selector=product_xs)
            product_loader.add_value('url', url)

            try:
                sku = product_xs.xpath('p[@class="product-sku"]/text()').re(
                    'KaTom #: (.*)')[0]
            except IndexError:
                sku = None
            product_loader.add_value('sku', sku)
            product_loader.add_value('identifier', sku)
            product_loader.add_xpath('name', 'a/@title')
            product_loader.add_css('image_url', '.img ::attr(src)')
            product_loader.add_xpath('category', '//h1[@class="title"]/text()')

            product = product_loader.load_item()
            if len(product.get('sku', '').split('-')) > 1:
                product['sku'] = '-'.join(product['sku'].split('-')[1:])

            yield Request(url=product_loader.get_output_value('url'),
                          meta={"product": product},
                          callback=self.parse_product)
Example #8
    def parse_category(self, response):
        products = response.css('div.product')
        for product_xs in products:
            try:
                product_name = product_xs.xpath('./a/@title').extract()[0]
            except IndexError:
                continue
            product_url = response.urljoin(
                product_xs.xpath('./a/@href').extract()[0])
            if product_url not in self.seen:
                yield Request(product_url,
                              self.parse_product,
                              dont_filter=True)
                self.seen.add(product_url)
            continue  # NOTE: short-circuits the loop; the item-building code below never runs
            product_identifier = re.findall(r'/p/(.+?)/', product_url)[0]
            product_price = product_xs.xpath(
                './/span[@itemprop="price"]/text()').re(r'[\d\,.]+')[0]
            product_stock = product_xs.css('div.stockinfo::text').re(
                r'[\d\,.]+')
            product_image = product_xs.xpath('.//img[@alt]/@src').extract()

            loader = ProductLoader(item=Product(), selector=product_xs)
            loader.add_value('identifier', product_identifier)
            loader.add_value('sku', product_identifier)
            loader.add_value('name', product_name)
            loader.add_value('url', product_url)
            loader.add_value('price', product_price)
            if product_stock:
                loader.add_value('stock', int(product_stock[0]))
            if product_image:
                loader.add_value('image_url',
                                 response.urljoin(product_image[0]))
            loader.add_value('category', response.meta['category'])

            yield loader.load_item()
        return  # NOTE: the pagination code below is unreachable

        pages = set(
            response.xpath('//ul[@id="pagelist1"]/li/a/text()').extract())
        next_page = response.meta['page'] + 1
        if str(next_page) in pages:
            url = add_or_replace_parameter(response.url, 'p', next_page)
            url = add_or_replace_parameter(url, 'q',
                                           response.meta['object_id'])
            yield Request(url,
                          callback=self.parse_category,
                          meta={
                              'category': response.meta['category'],
                              'object_id': response.meta['object_id'],
                              'page': next_page
                          })
Example #9
    def parse(self, response):
        data = json.loads(str(response.body, 'utf-8'))
        for item in data["list"]:
            id = item["userUrl"]
            if id not in self.profiles:
                self.profiles.append(id)
                user_url = "https://www.kaggle.com" + id
                yield scrapy.Request(user_url, self.parseLocation)

        self.page += 1
        url = add_or_replace_parameter(response.url, 'page', self.page)
        url = add_or_replace_parameter(url, 'pageSize', self.pageSize)
        yield scrapy.Request(url, self.parse)
Example #10
 def test_add_or_replace_parameter(self):
     url = 'http://domain/test'
     self.assertEqual(add_or_replace_parameter(url, 'arg', 'v'),
                      'http://domain/test?arg=v')
     url = 'http://domain/test?arg1=v1&arg2=v2&arg3=v3'
     self.assertEqual(add_or_replace_parameter(url, 'arg4', 'v4'),
                      'http://domain/test?arg1=v1&arg2=v2&arg3=v3&arg4=v4')
     self.assertEqual(add_or_replace_parameter(url, 'arg3', 'nv3'),
                      'http://domain/test?arg1=v1&arg2=v2&arg3=nv3')
     url = 'http://domain/test?arg1=v1'
     self.assertEqual(add_or_replace_parameter(url, 'arg2', 'v2', sep=';'),
                      'http://domain/test?arg1=v1;arg2=v2')
     self.assertEqual(
         add_or_replace_parameter("http://domain/moreInfo.asp?prodID=",
                                  'prodID', '20'),
         'http://domain/moreInfo.asp?prodID=20')
     url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2%2C60&CatID=60'
     self.assertEqual(
         add_or_replace_parameter(url,
                                  'BCat',
                                  'newvalue',
                                  url_is_quoted=True),
         'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
     url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2,60&CatID=60'
     self.assertEqual(
         add_or_replace_parameter(url, 'BCat', 'newvalue'),
         'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
     url = 'http://rmc-offers.co.uk/productlist.asp?'
     self.assertEqual(
         add_or_replace_parameter(url, 'BCat', 'newvalue'),
         'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue')
Example #11
    def start_requests(self):
        # for id
        # codes = (u"0", u"1", u"2", u"3", u"4", u"5", u"7", u"8", u"9", u"10", u"11")

        codes = (u"56", u"42", u"50", u"60", u"61", u"62", u"63", u"53",
                 u"134", u"80", u"24", u"25", u"21", u"49", u"46", u"44",
                 u"45", u"28", u"40", u"1", u"3", u"5", u"6", u"8", u"13",
                 u"77", u"75", u"73", u"106", u"79", u"10", u"39", u"38",
                 u"15", u"19", u"18", u"57", u"30", u"36", u"35", u"34", u"33")
        # codes = (u"40",)

        # for id
        # href = u"http://www.nyaqinow.net/DynamicTable.aspx?"
        # for location
        # href = u"http://www.nyaqinow.net/StationDetails.aspx?"
        # for datascrapy crawl
        href = u"http://www.nyaqinow.net/StationInfo.aspx?"
        for code_value in codes:
            # for id
            # url = add_or_replace_parameter(href, u"G_ID", code_value)
            # for location
            url = add_or_replace_parameter(href, u"ST_ID", code_value)
            # for data
            # url = add_or_replace_parameter(href, u"ST_ID", code_value)

            yield Request(url=url,
                          callback=self.parse,
                          meta={u"code": code_value})
Example #12
    def parse_catall(self, response):
        error = False
        try:
            html = response.body.split('@@ebusiness@@')[1]
        except IndexError:
            error = True
        if error:
            req = response.request
            meta = response.meta
            retries = meta.get('retries', 0)
            if retries < 3:
                meta['retries'] = retries + 1
                self.log('Retrying {}, attempt: {}'.format(
                    req.url, retries + 1))
                yield req.replace(dont_filter=True,
                                  callback=self.parse_catall,
                                  meta=meta)
            return
        hxs = Selector(text=html)
        for prod in hxs.xpath(
                '//td[@valign="middle" or @valign="top"]//a/@href').extract():
            yield Request(response.urljoin(prod), callback=self.parse_product)

        pagination = response.body.split('@@ebusiness@@')[0]
        if not pagination:
            return
        pages = re.findall(r"changePage\('(.+?)',", pagination)
        for page in pages:
            url = add_or_replace_parameter(response.url, 'p', page)
            yield Request(url, self.parse_catall)
Example #13
    def run_crawl_all(self, response):
        print(' --- run_crawl_all --- ')
        t = datetime.datetime.now().strftime("%Y.%m.%d-%H:%M:%S")
        next_offset = int(url_query_parameter(response.url, 'offset')) + 10

        list_parse_res = list_parse(eval(response.body.decode()))
        list_db_data = list_into_dbdata(list_parse_res,
                                        self.task['task_biz_enname'],
                                        self.task['task_biz_chname'],
                                        self.task['_id'])

        # reached the end, or an error occurred
        if not list_db_data:
            self.task['task_status'] = 'end_success'
            print('about to exit')
        else:
            res = mongo_instance.loads.insert_many(list_db_data)
            if self.crawled_times == 1:
                print(' the first inserted id is: %s' % res.inserted_ids[0])
                self.task['task_start_loadid'] = res.inserted_ids[0]
            self.crawled_times += 1
            print('still have requests, no rush to exit')

        self.task['task_updatetime'] = t
        self.task['task_endtime'] = t
        mongo_instance.tasks.find_one_and_update(
            filter={'_id': self.task['_id']}, update={'$set': self.task})

        if 'running' not in self.task['task_status']:
            return
        else:
            yield scrapy.Request(url=add_or_replace_parameter(
                response.url, 'offset', next_offset),
                                 headers=FakeLoadParams.headers,
                                 method='GET')
Example #14
    def start_requests(self):
        brands = {
            'USN': ['http://www.predatornutrition.com/shop-by-brand/usn'],
            'Optimum Nutrition': [
                'http://www.predatornutrition.com/shop-by-brand/optimum-nutrition'
            ],
            'BSN': ['http://www.predatornutrition.com/shop-by-brand/bsn'],
            'PhD':
            ['http://www.predatornutrition.com/shop-by-brand/phd-nutrition'],
            'Maxi Nutrition':
            ['http://www.predatornutrition.com/shop-by-brand/maxinutrition'],
            'Reflex':
            ['http://www.predatornutrition.com/shop-by-brand/reflex'],
            'Mutant':
            ['http://www.predatornutrition.com/shop-by-brand/mutant'],
            'Cellucor':
            ['http://www.predatornutrition.com/shop-by-brand/cellucor'],
            'Sci-MX':
            ['http://www.predatornutrition.com/shop-by-brand/sci-mx']
        }

        cookies = {
            'GlobalE_Data': {
                'countryISO': 'GB',
                'cultureCode': 'en',
                'currencyCode': 'GBP'
            }
        }

        for brand_name, urls in brands.iteritems():
            for url in urls:
                link = add_or_replace_parameter(url, 'viewAll', 'true')
                yield Request(link,
                              meta={'brand': brand_name},
                              cookies=cookies)
Example #15
    def start_requests(self):
        codes = (u"AY1", u"BAR6", u"BAR9", u"HB010", u"HB011", u"BAI2", u"WIL1", u"BUR2", u"BUR1", u"WIL8", u"WIL5",
                 u"NEW2", u"CAM3", u"CAM5", u"CAM4", u"CAM1", u"CRL2", u"HB013", u"WIL3", u"HB012", u"EWE2", u"FAR2",
                 u"GA1", u"GA2", u"GA3", u"GIRT", u"FAR1", u"T55", u"LHR2", u"T54", u"HEN", u"HB008", u"HB009", u"HI1",
                 u"SIPS", u"HB002", u"HB003", u"HS5", u"HS4", u"HS2", u"HS9", u"HS8", u"HS7", u"HS6", u"BN2", u"HIL1",
                 u"HIL4", u"HIL5", u"HI3", u"HB006", u"HB007", u"MAN1", u"MAN7", u"MAHG", u"WIL7", u"NUL1", u"OX6",
                 u"OX3", u"REA2", u"REA4", u"RED3", u"IMP", u"ORCH", u"M60", u"WIL4", u"CW", u"SLH7", u"SLH3", u"SLH6",
                 u"SLH5", u"SLH8", u"SLH9", u"SLH4", u"GX", u"SHOL", u"MONK", u"HB005", u"STK7", u"STK5", u"SUN2",
                 u"SUN4", u"BN1", u"TAM1", u"TAME", u"GOS1", u"TRAF", u"TRF2", u"WD1", u"WL4", u"WL1", u"WL5", u"HB004",
                 u"WAT", u"HB001", u"WID2", u"WID1", u"WIG7", u"NEW3", u"WYA4", u"WSTO", u"YK10", u"YK11", u"YK16",
                 u"YK7", u"YK13", u"YK8", u"YK9", u"YK15", u"YK018", u"BAR3", u"BPLE", u"BATH", u"BIL", u"BBRD",
                 u"BIRR", u"AGRN", u"BIR1", u"BLAR", u"BLC2", u"BORN", u"BDMA", u"BRT3", u"BRS8", u"BURW", u"CAM",
                 u"CANK", u"CANT", u"CARL", u"MACK", u"CHAT", u"CHLG", u"CHS7", u"CHBO", u"CHBR", u"COAL", u"DCST",
                 u"EB", u"EX", u"GLAZ", u"HM", u"HONI", u"HUL2", u"HULR", u"LB", u"LEAM", u"LEAR", u"LEED", u"LED6",
                 u"LEIR", u"LECU", u"LEOM", u"LIN3", u"LVP", u"LH", u"LUTR", u"MAN3", u"MKTH", u"MID", u"NEWC", u"NCA3",
                 u"NTN3", u"NO12", u"NOTT", u"NWBV", u"BOLD", u"OX", u"OX8", u"PLYM", u"PMTH", u"PRES", u"REA5",
                 u"ROCH", u"ECCL", u"SASH", u"SDY", u"SCN2", u"SHBR", u"SHDG", u"SHE", u"SIB", u"SA33", u"SOUT",
                 u"SEND", u"SHLW", u"OSY", u"SOTR", u"EAGL", u"STKR", u"STOK", u"STOR", u"SUNR", u"WAL4", u"WAR",
                 u"WEYB", u"WFEN", u"WSMR", u"WIG5", u"TRAN", u"WTHG", u"YW")

        # codes = (u"LEIR",)
        url = u"http://www.airqualityengland.co.uk/site/latest"
        for code_value in codes:
            url = add_or_replace_parameter(url, u"site_id", code_value)

            yield Request(
                url=url,
                callback=self.parse,
                meta={u"code": code_value}
            )
Example #16
    def parse_listing(self, response):
        """
        Extract product list.
        
        @url https://www.walgreens.com/store/c/eyes/ID=360457-tier3
        @returns requests 1
        """
        blob = response.css('script').re_first(
            r'__APP_INITIAL_STATE__ = (\{.+\});')
        if not blob:
            return

        data = json.loads(blob)

        if not data['searchResult'].get('productList'):
            return

        for each in data['searchResult']['productList']:
            yield response.follow(each['productInfo']['productURL'],
                                  callback=self.parse_product)

        limit = response.meta.get('limit', 24)
        offset = int(url_query_parameter(response.url, 'No', 0)) + limit

        return response.follow(add_or_replace_parameter(
            response.url, 'No', offset),
                               callback=self.parse_listing,
                               meta={
                                   'offset': offset,
                                   'limit': limit
                               })
Example #17
    def parse(self, response):
        data = json.loads(str(response.body, 'utf-8'))
        for item in data:
            finalData = {
                "language": item["languageName"],
                "comments": item["totalComments"],
                "votes": item["totalVotes"],
                "medal": item["medal"],
                "id": item["id"],
                "date": item["scriptVersionDateCreated"]
            }
            id = item["id"]
            yield finalData

        if id not in self.ids:
            self.ids.append(id)
        else:
            logging.info("The id is duplicate, stop here")
            return

        if data[len(data) - 1]["id"]:
            self.page += 20
            if self.page > 1000:
                self.page = 1000
            url = add_or_replace_parameter(response.url, 'after',
                                           data[len(data) - 1]["id"])
            url = re.sub(r"([0-9]){1,9}(?=\?)", str(self.page), url)
            yield scrapy.Request(url, self.parse)
Example #18
    def parse_category(self, response):
        try:
            data = SpiderSchema(response).get_products()
        except Exception:
            return
        products = False
        for product in data:
            if not product.get('sku'):
                continue
            products = True
            loader = ProductLoader(Product(), response=response)
            loader.add_value('identifier', product['sku'])
            loader.add_value('url', product['url'][0])
            loader.add_value('name', product['name'])
            loader.add_value('sku', product['sku'])
            category = response.css('a.GTM-breadcumb::text').extract(
            )[1:] or response.meta.get('category')
            loader.add_value('category', category)
            loader.add_value('image_url', product['image'])
            loader.add_value('brand', product['brand'])
            if product['offers']['properties']['availability'] != 'in stock':
                loader.add_value('stock', 0)
            price = product['offers']['properties']['price']
            yield Request(loader.get_output_value('url'),
                          self.parse_product,
                          meta={'item': Product(loader.load_item())})
        if not products:
            return

        page = url_query_parameter(response.url, 'page')
        if page:
            url = add_or_replace_parameter(response.url, 'page', int(page) + 1)
        else:
            id_families = response.xpath(
                '//input[@data-key="idFamilies"]/@value').extract_first()
            if id_families:
                url = add_or_replace_parameter(
                    'https://www.pccomponentes.pt/listado/ajax?page=0&order=price-desc',
                    'idFamilies[]', id_families)
            elif response.url.endswith('/novedades/'):
                return
            elif response.url.endswith('/'):
                url = response.url + 'ajax?page=0&order=price-desc'
            else:
                return

        yield Request(url, self.parse_category, meta={'category': category})
Example #19
    def new_request(self, resp):
        date_url = self.url_to_datetime(resp)

        # decrease day
        previous = time_to_dict(date_url, 1)

        new_url = add_or_replace_parameter(resp.url, u"fecha_dia",
                                           previous[u"day"])
        new_url = add_or_replace_parameter(new_url, u"fecha_mes",
                                           previous[u"month"])
        new_url = add_or_replace_parameter(new_url, u"fecha_anio",
                                           previous[u"year"])
        return Request(
            url=new_url,
            callback=self.check_validity,
            # dont_filter=True
        )
Example #20
    def parse_pages_json(self, response):
        # get count of pages
        j_response = json.loads(response.body_as_unicode())
        page_count = int(j_response["list"]["numPages"])

        # open pages
        for page in xrange(page_count - 1):
            yield Request(url.add_or_replace_parameter(response.url, 'p', page), callback=self.parse_page_json)
Example #21
    def parse(self, response):
        links = response.css('[data-hook=product-list-grid-item] a::attr(href)').getall()
        for l in links:
            yield Request(l, callback=self.parse_item)

        if response.css('[data-hook=load-more-button]'):
            page_no = response.meta.get('page', 1) + 1
            next_page_url = add_or_replace_parameter(response.url, 'page', page_no)  # page_no is already the incremented page
            yield Request(next_page_url, callback=self.parse, meta={'page': page_no})
Example #22
    def extract_links(self, response):
        page_no = url_query_parameter(response.url, 'page_no', '0')  # string default so int() below never sees None
        if not response.css('.Result a'):
            return []

        return [
            Link(url=add_or_replace_parameter(response.url, 'page_no',
                                              int(page_no) + 1))
        ]
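url_query_parameter (also from w3lib.url) returns its default when the parameter is missing, which is why the snippet above uses the string default '0': int(None) would raise a TypeError. A short illustration, assuming w3lib and hypothetical URLs:

    from w3lib.url import url_query_parameter

    print(url_query_parameter('http://domain/list?page_no=3', 'page_no'))  # -> '3' (a string)
    print(url_query_parameter('http://domain/list', 'page_no', '0'))       # -> '0' (the default)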
Example #23
    def parse_product_list(self, response):
        hxs = HtmlXPathSelector(response)

        categories = hxs.select('//li[@class="PANEL ALL"]//a/@href').extract()
        categories += hxs.select(
            '//li[@class="PANEL BY-SIZE"]//a/@href').extract()
        categories += hxs.select(
            '//li[@class="PANEL BY-TYPE"]//a/@href').extract()
        for url in categories:
            url = url_query_cleaner(response.urljoin(url))
            yield Request(url, callback=self.parse_product_list)

        products = hxs.select('//div[@id="pdList"]//a/@href').extract()
        products += hxs.select(
            '//div[@class="product-tile"]//a/@href').extract()
        for url in products:
            pid = url.split('_')[-1]
            if pid not in self.parsed_products:
                self.parsed_products.append(pid)
                url = url_query_cleaner(response.urljoin(url))
                yield Request(url, callback=self.parse_product)

        product_variants = hxs.select(
            '//div[@class="productVariantTypeOptions"]/a/@href').extract()
        for url in product_variants:
            self.log('productVariantTypeOptions! {}'.format(url))
            pid = url.split('_')[-1]
            if pid not in self.parsed_products:
                self.parsed_products.append(pid)
                url = url_query_cleaner(response.urljoin(url))
                yield Request(url, callback=self.parse_product)

        next_page = None
        cur_page = url_query_parameter(response.url, 'pi', None)
        if cur_page:
            # The spider is already crawling the pages, we just assign the current url
            # so we can increment the 'pi' argument
            next_page = response.url
        else:
            # First page of the product list, we extract the pagination url with regex
            next_page = re.findall('.get\( &quot;(.*)pi=', response.body)
            if next_page:
                next_page = response.urljoin(next_page[0])

        if (next_page and products != response.meta.get('products', [])) or (
                next_page and
                product_variants != response.meta.get('product_variants', [])):
            cur_page = url_query_parameter(next_page, 'pi', '1')
            url = add_or_replace_parameter(next_page, 'pi',
                                           str(int(cur_page) + 1))
            self.log('Goes to next page: ' + url)
            yield Request(url,
                          callback=self.parse_product_list,
                          meta={
                              'products': products,
                              'product_variants': product_variants
                          })
Example #24
 def start_requests(self):
     codes = (u"17", u"3", u"10", u"2", u"9", u"4", u"5", u"8")
     # codes = (u"17",)
     url = u"https://novascotia.ca/nse/airdata/StationInfo3.aspx?"
     for code_value in codes:
         url = add_or_replace_parameter(url, u"ST_ID", code_value)
         yield Request(url=url,
                       callback=self.parse,
                       meta={u"code": code_value})
Example #25
 def parse_pages(self, response):
     html = json.loads(response.body)
     selector = Selector(text=html['html'])
     for url in selector.xpath(
             '//@href[contains(., "/produkt/")]').extract():
         yield Request(url, self.parse_product)
     total_sets = int(selector.css('.totalSets::text').extract_first())
     for s in xrange(total_sets):
         url = add_or_replace_parameter(response.url, 'set', s + 1)
         yield Request(url, self.parse_pages)
Example #26
 def parse_category(self, response):
     try:
         category_id = response.xpath('//script/text()').re(
             "categoryID: *'(.+)'")[0]
     except IndexError:
         return
     per_page = response.xpath('//script/text()').re(
         "var showInput *= *'(.+)'")[0]
     sort = response.xpath('//script/text()').re(
         "var sortInput *= *'(.+)'")[0]
     url = 'http://www.bmstores.co.uk/hpcProduct/productbyfilter/ajaxmode/1'
     parameters = ('categoryID', 'perPage', 'sort')
     values = (category_id, per_page, sort)
     for parameter, value in zip(parameters, values):
         url = add_or_replace_parameter(url, parameter, value)
     pages = response.xpath('//@data-pageto').extract()
     for page in pages:
         yield Request(add_or_replace_parameter(url, 'pageNum', page),
                       self.parse_json_products)
Example #27
    def start_requests_id(self):
        codes_id = (u"22", u"23", u"64", u"75", u"80", u"81", u"90", u"109",
                    u"110", u"118")
        # codes_id = (u"22",)

        href = u"https://fortress.wa.gov/ecy/enviwa/DynamicTable.aspx?"
        for code_id_value in codes_id:
            url = add_or_replace_parameter(href, u"G_ID", code_id_value)

            yield Request(url=url, callback=self.parse)
Example #28
    def get_matched_products(self, website_id):
        api_url = urljoin_rfc(self.host,
                              '/api/get_matched_products_paged.json')
        api_url = add_or_replace_parameter(api_url, 'website_id',
                                           str(website_id))
        api_url = add_or_replace_parameter(api_url, 'api_key', self.api_key)

        page = 0
        count = 1000
        continue_next_page = True
        matched_products = []

        while continue_next_page:
            api_url = add_or_replace_parameter(api_url, 'start',
                                               str(page * count))
            api_url = add_or_replace_parameter(api_url, 'count', str(count))

            try:
                try_no = 1
                try_query = True
                while try_query:
                    try:
                        r = requests.get(api_url)
                        data = r.json()
                        new_matches = data.get('matches', [])
                    except Exception, e:
                        if not (try_no <= 10 and self.retry):
                            raise e
                        else:
                            try_no += 1
                            time.sleep(1)
                    else:
                        try_query = False
            except Exception:
                continue_next_page = False
            else:
                matched_products.extend(new_matches)
                if len(new_matches) < count:
                    continue_next_page = False
                else:
                    page += 1

        return matched_products
Example #29
class RebelSport(CrawlSpider):
    name = 'kitbag_au-rebelsport'
    allowed_domains = ['rebelsport.com.au']
    start_urls = [
        'http://www.rebelsport.com.au/store/fangear/soccer-football/604'
    ]

    categories = LinkExtractor(
        restrict_css='.secondary-menu',
        process_value=lambda url: add_or_replace_parameter(
            url, 'pageSize', '500'))
    pages = LinkExtractor(restrict_css='.pagination')
    products = LinkExtractor(
        restrict_css='.product',
        process_value=lambda url: make_variant_url(url_query_cleaner(url)))

    rules = (Rule(categories), Rule(products, callback='parse_product'))

    def parse_product(self, response):
        data = response.xpath('//script/text()').re('{\\\\"Variants.+}')[0]
        data = json.loads(data.replace('\\"', '"'))
        variants = data['Variants']
        for variant in variants:
            url = response.urljoin(variant['ProductPLU'])
            yield Request(make_variant_url(url), self.parse_product)

        loader = ProductLoader(item=Product(), response=response)
        identifier = response.xpath(
            '//input[@id="ProductPLU"]/@value').extract_first()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '(//h1[@itemprop="name"]/text())[1]')
        metadata = {}
        for i in xrange(3):
            variant_name = data['Variant%dSelected' % (i + 1)]
            if variant_name and variant_name != 'N/A':
                loader.add_value('name', variant_name)
                metadata[data['Variant%dHeader' % (i + 1)]] = variant_name
                if 'size' in variant_name.lower():
                    metadata['size'] = variant_name[5:].strip()
        price = response.css('.price-value .currency::text').extract()
        loader.add_value('price', price.pop())
        category = response.css('.breadcrumb a::text').extract()
        loader.add_value('category', category[1:])
        loader.add_css('image_url', '.product-image::attr(src)')
        loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
        loader.add_value('shipping_cost', '7.95')
        stock = response.css('.product-stock-widget::attr(ng-init)').re(
            'AvailableOnline: (\w+)')[0]
        if stock != 'true':
            loader.add_value('stock', 0)
        item = loader.load_item()
        item['metadata'] = metadata
        yield item
Example #30
    def parse(self, response):
        # Get help from:  http://stackoverflow.com/questions/38574869/how-can-i-jump-to-next-page-in-scrapy
        if response.meta.get('is_json', False):
            page = Selector(text=json.loads(response.body)['table'])
        else:
            page = Selector(response) 

        if self.flag:
            self.total_item_num = int(page.xpath('//div[@id="show-more-courses"]/text()').re(r'courses of (.*)')[0]) + 50
            print "Total courses: ", self.total_item_num
            self.steps = self.total_item_num / 50 + 1
            self.flag = False

        base_urls = "https://www.class-central.com/courses/past"
        #base_urls = "https://www.class-central.com/courses/recentlyAdded"
        my_header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}

        divs = page.xpath('//tr[@itemtype="http://schema.org/Event"]')
        #print "print content", len(divs) 
        print "Process: ", self.cnt, '/', self.steps

        for div in divs:
            item = MoocCrawlerItem()
            item = {k:"" for k in item.keys()}

            parse_name = div.xpath('./td/a/span[@class="course-name-text"]/text()').extract_first().strip()
            item['name'] = parse_name
            parse_score = div.xpath('./td/div[@class="course-rating-value"]/text()').extract_first().strip()
            if len(parse_score) > 3:
                parse_score = parse_score[:3]
            item['score'] = string.atof(parse_score) * 2
            parse_platform = div.xpath('./td/div[@class="course-provider"]/text()').extract_first().strip()
            item['platform'] = parse_platform
            parse_url = div.xpath('./td/a/@href').extract_first().decode().encode('utf-8').strip()
            item['url'] = "https://www.class-central.com" + parse_url
            parse_cid = re.findall(r'/mooc/(.*)/', parse_url)[0]
            item['cid'] = "cc" + parse_cid

            req = scrapy.Request(item['url'], headers=my_header, callback=self.parse_detail_page)
            req.meta['item'] = item   

            yield req
        
        #next_page_el = response.xpath("//div[@id='show-more-courses']")

        if self.cnt < self.steps:
        #if next_page_el:
            next_page_url = "https://www.class-central.com/maestro/courses/past?page=1&_=1471346096733"
            #next_page_url = "https://www.class-central.com/maestro/courses/recentlyAdded?page=1"
            next_page = response.meta.get('page', 1) + 1
            next_page_url = add_or_replace_parameter(next_page_url, 'page', next_page)
            r = scrapy.Request(next_page_url, headers=my_header, callback=self.parse, meta={'page': next_page, 'is_json': True})
            self.cnt += 1
            yield r
Example #31
    def parse(self, response):
        data = json.loads(response.body)
        total_results = data['totalDatasetListItems']
        page = 1
        # figure out how many pages there are and loop through them
        for i in range(20, total_results, 20):  # step 20 since we have 20 results per page
            url = add_or_replace_parameter(response.url, 'page', page)
            yield scrapy.Request(url, self.parse_page)
            page += 1

        # don't forget to parse the first page as well!
        yield from self.parse_page(response)
Example #32
 def parse_page(self, response):
     data = json.loads(response.body)
     if not data['success']:
         self.logger.warning('Failed pagination %s' % response.url)
     selector = Selector(text=data['paginationLink'])
     for page in selector.css(
             'div.pagination ::attr(data-pageto)').extract():
         url = add_or_replace_parameter(response.url, 'pageNum', page)
         yield Request(url, self.parse_page)
     selector = Selector(text=data['pageHTML'])
     for url in selector.css('a.product::attr(href)').extract():
         yield Request(response.urljoin(url), self.parse_product)
Example #33
    def start_requests(self):
        codes = (u"9", u"12", u"2", u"10", u"4", u"11", u"3", u"6", u"13",
                 u"7", u"8", u"5", u"15")
        # codes = (u"4",)

        url = u"http://envista.pima.gov/StationInfo1.aspx?"
        for code_value in codes:
            url = add_or_replace_parameter(url, u"ST_ID", code_value)

            yield Request(url=url,
                          callback=self.parse,
                          meta={u"code": code_value})
Example #34
    def parse_hotel(self, response):
        hxs = Selector(response)
        hotel = HtmlParser.extract_hotel(response.url, hxs)

        checkin = url_query_parameter(response.url,"checkin")
        checkout = url_query_parameter(response.url,"checkout")

        checkinDatetime = None
        checkoutDatetime = None

        today = datetime.date.today()

        if checkin is not None:
            checkinDatetime = datetime.datetime.strptime(checkin, "%Y-%m-%d").date()
            checkinDatetime = self.add_months(checkinDatetime,1)
        else:
            checkinDatetime = datetime.date(today.year, today.month, 15)

        if checkout is not None:
            checkoutDatetime = datetime.datetime.strptime(checkout, "%Y-%m-%d").date()
            checkoutDatetime = self.add_months(checkoutDatetime,1)
        else:
            checkoutDatetime = datetime.date(today.year, today.month, 16)

        maxDatetime = self.add_months(today,18)

        if checkinDatetime < maxDatetime:
            url = url_query_cleaner(response.url)
            url = add_or_replace_parameter(url,"checkin",str(checkinDatetime))
            url = add_or_replace_parameter(url,"checkout",str(checkoutDatetime))
            #logging.warning('----------------------------  %s' % url)
            yield Request(url, callback=self.parse_hotel)

        yield hotel["hotel"]

        if len(hotel["rooms"]) > 0:
            for room in hotel["rooms"]:
                yield room
Example #35
    def parse_first_page(self, response):
        total = int(response.xpath('//*[@id="voltron_srp_main-content"]/comment()').re(r'"formattedResultCount":"([\d,]+)"')[0].replace(",",""))
        page_count = total / self.PAGE_SIZE

        if page_count > self.MAX_PAGE_COUNT:
            page_count = self.MAX_PAGE_COUNT
        
        
        # parse first page
        for item in self.parse_page(response):
            yield item
        
        for i in xrange(page_count-1):
            u = url.add_or_replace_parameter(
                self.CONTACTS_URL, 'page_num', i+2)
            yield Request(u, callback=self.parse_page)
Example #36
 def test_add_or_replace_parameter(self):
     url = 'http://domain/test'
     self.assertEqual(add_or_replace_parameter(url, 'arg', 'v'),
                      'http://domain/test?arg=v')
     url = 'http://domain/test?arg1=v1&arg2=v2&arg3=v3'
     self.assertEqual(add_or_replace_parameter(url, 'arg4', 'v4'),
                      'http://domain/test?arg1=v1&arg2=v2&arg3=v3&arg4=v4')
     self.assertEqual(add_or_replace_parameter(url, 'arg3', 'nv3'),
                      'http://domain/test?arg1=v1&arg2=v2&arg3=nv3')
     url = 'http://domain/test?arg1=v1'
     self.assertEqual(add_or_replace_parameter(url, 'arg2', 'v2', sep=';'),
                      'http://domain/test?arg1=v1;arg2=v2')
     self.assertEqual(add_or_replace_parameter("http://domain/moreInfo.asp?prodID=", 'prodID', '20'),
                      'http://domain/moreInfo.asp?prodID=20')
     url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2%2C60&CatID=60'
     self.assertEqual(add_or_replace_parameter(url, 'BCat', 'newvalue', url_is_quoted=True),
                      'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
     url = 'http://rmc-offers.co.uk/productlist.asp?BCat=2,60&CatID=60'
     self.assertEqual(add_or_replace_parameter(url, 'BCat', 'newvalue'),
                      'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue&CatID=60')
     url = 'http://rmc-offers.co.uk/productlist.asp?'
     self.assertEqual(add_or_replace_parameter(url, 'BCat', 'newvalue'),
                      'http://rmc-offers.co.uk/productlist.asp?BCat=newvalue')
Example #37
    def parse(self, response):
        for restaurant in response.css('.shortSellDetails'):
            il = RestaurantItemLoader(selector=restaurant)
            il.add_css('name', '.property_title::text')
            il.add_css('url', '.property_title::attr(href)')
            il.add_css('cuisines', '.cuisine::text')
            item = il.load_item()
            yield scrapy.Request(
                item['url'],
                callback=self.parse_details,
                meta=dict(item=item)
            )

        pagination_url = (
            'https://www.tripadvisor.com/RestaurantSearch?Action=PAGE'
            '&geo=294079&ajax=1&sortOrder=popularity&o=a0'
            '&availSearchEnabled=false'
        )
        if not response.css('.nav.next.disabled'):
            offset = response.meta.get('offset', 0) + 30
            pagination_url = add_or_replace_parameter(
                pagination_url, 'o', 'a{}'.format(offset))
            yield scrapy.Request(pagination_url, meta={'offset': offset})
Example #38
 def _process_url(self, url):
     return add_or_replace_parameter(
                 'http://localhost:8998/redirect-to',
                 'goto', url)
Example #39
 def _process_url(self, url):
     return add_or_replace_parameter(
                 self.mockserver.url('/redirect-to'),
                 'goto', url)