def parse_product_list(self, response):
    hxs = HtmlXPathSelector(response)
    categories = hxs.select('//li[@class="PANEL ALL"]//a/@href').extract()
    categories += hxs.select('//li[@class="PANEL BY-SIZE"]//a/@href').extract()
    categories += hxs.select('//li[@class="PANEL BY-TYPE"]//a/@href').extract()
    for url in categories:
        url = url_query_cleaner(response.urljoin(url))
        yield Request(url, callback=self.parse_product_list)

    products = hxs.select('//div[@id="pdList"]//a/@href').extract()
    products += hxs.select('//div[@class="product-tile"]//a/@href').extract()
    for url in products:
        pid = url.split('_')[-1]
        if pid not in self.parsed_products:
            self.parsed_products.append(pid)
            url = url_query_cleaner(response.urljoin(url))
            yield Request(url, callback=self.parse_product)

    product_variants = hxs.select(
        '//div[@class="productVariantTypeOptions"]/a/@href').extract()
    for url in product_variants:
        self.log('productVariantTypeOptions! {}'.format(url))
        pid = url.split('_')[-1]
        if pid not in self.parsed_products:
            self.parsed_products.append(pid)
            url = url_query_cleaner(response.urljoin(url))
            yield Request(url, callback=self.parse_product)

    next_page = None
    cur_page = url_query_parameter(response.url, 'pi', None)
    if cur_page:
        # The spider is already crawling the pages; we just assign the
        # current url so we can increment the 'pi' argument.
        next_page = response.url
    else:
        # First page of the product list: extract the pagination url with a regex.
        next_page = re.findall(r'.get\( "(.*)pi=', response.body)
        if next_page:
            next_page = response.urljoin(next_page[0])
    if (next_page and products != response.meta.get('products', [])) or (
            next_page and product_variants != response.meta.get('product_variants', [])):
        cur_page = url_query_parameter(next_page, 'pi', '1')
        url = add_or_replace_parameter(next_page, 'pi', str(int(cur_page) + 1))
        self.log('Goes to next page: ' + url)
        yield Request(url, callback=self.parse_product_list,
                      meta={'products': products,
                            'product_variants': product_variants})

def test_url_query_cleaner_keep_fragments(self):
    self.assertEqual(
        'product.html?id=200#foo',
        url_query_cleaner("product.html?id=200&foo=bar&name=wired#foo",
                          ['id'], keep_fragments=True))
    self.assertEqual(
        'product.html?id=200',
        url_query_cleaner("product.html?id=200&foo=bar&name=wired",
                          ['id'], keep_fragments=True))

def _set_image_value(self, instance, image_value):
    """Save the given image value to the instance.

    :param instance:
    :param image_value:
    :return:
    """
    try:
        decoded_data = self._parse_base64(image_base64=image_value)
        filename = self._generate_filename()
        image = self._process_image(data=decoded_data, max_size=600)
        instance.image.save(
            filename,
            image,
            save=False,
        )
        url = url_query_cleaner(instance.image.url)
        instance.image_insert_value = {"image": f"{url}"}
    # If the image is not base64:
    #  - when it holds a url address, store the url in image_insert_value
    #  - when it is neither base64 nor a link, raise ValueError
    except AttributeError:
        if image_value[:4] == "http" or image_value[:6] == settings.MEDIA_URL:
            instance.image_insert_value = {"image": f"{image_value}"}
        else:
            raise ValueError(
                "Not a well-formed base64 image. "
                "Check that it starts with data:image/png;base64."
            )

def parse_json(self, response):
    data = json.loads(response.body)
    selector = Selector(text=data['products'])
    for url in selector.xpath('//a/@href[contains(., ".prd")]').extract():
        yield Request(
            url_query_cleaner(response.urljoin(url), ('skuId', )),
            self.parse_product)

def clean_url(url):
    if "youtube.com" in url or "youtu.be" in url:
        return url
    # u = url_normalize(url)
    u = url
    u = url_query_cleaner(u,
                          parameterlist=[
                              'utm_source', 'utm_medium', 'utm_campaign',
                              'utm_term', 'utm_content'
                          ],
                          remove=True,
                          keep_fragments=True)
    # if "cdn.discordapp.com" in u and ".gif" in u:
    #     u = u.replace("cdn.discordapp.com", "media.discordapp.net")
    if len(u) == len(url):
        u = url
    # headers = {'User-Agent': 'Mozilla/5.0'}
    # response = requests.get(u, headers=headers)
    # if response.history:
    #     u = response.url
    if "https://www.google.com/url?q=" in u:
        u = clean_url(u.replace("https://www.google.com/url?q=", ""))
    # ancre = re.search(r"\#\w*$", url)
    # if ancre is not None and ancre.group(0) not in u:
    #     u = u + ancre.group(0)
    if u[-1:] == '#':
        return u[:-1]
    else:
        return u

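# A quick check of the cleaner above (hypothetical URL): the utm_* tracking
# parameters are stripped, while other parameters and the fragment survive.
assert clean_url('https://example.com/a?utm_source=x&id=1#sec') == \
    'https://example.com/a?id=1#sec'
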
def request_fingerprint(self, request):
    url = url_query_cleaner(request.url, ['snr'], remove=True)
    request = request.replace(url=url)
    return super().request_fingerprint(request)

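# A minimal sketch of how a fingerprint override like the one above is
# typically wired into Scrapy (module path and class name are hypothetical):
# subclassing RFPDupeFilter makes requests that differ only in the 'snr'
# tracking parameter collapse to the same fingerprint.
from scrapy.dupefilters import RFPDupeFilter
from w3lib.url import url_query_cleaner


class SnrAwareDupeFilter(RFPDupeFilter):
    def request_fingerprint(self, request):
        url = url_query_cleaner(request.url, ['snr'], remove=True)
        return super().request_fingerprint(request.replace(url=url))

# settings.py (hypothetical project path):
# DUPEFILTER_CLASS = 'myproject.dupefilters.SnrAwareDupeFilter'
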
class AsosItem(scrapy.Item):
    """Scrapy item to store scraped data from asos.com.

    Attributes:
        article_type (scrapy.Field): List of the associated article type,
            for example: ['women', 'shoes'].
        product_name (scrapy.Field): Str of the product name, for example:
            New Look Satin Twist Slider.
        product_url (scrapy.Field): Str of the url of the product.
        brand_name (scrapy.Field): Str of associated brand of the product,
            for example: New Look.
        price (scrapy.Field): Str of the price of the product.
        fit (scrapy.Field): List of the different sizes for the product.
        colors (scrapy.Field): List of colors for the product.
        details_and_care_info (scrapy.Field): List of care information for
            the product.
        details_and_care_list (scrapy.Field): List of care and details
            information.
        image_urls (scrapy.Field): List of urls of the images.
        images (scrapy.Field): List of hashes for corresponding image_urls.
        spider_name (scrapy.Field): Str of spider name.
    """
    article_type = scrapy.Field(output_processor=RemoveSaleHome())
    product_name = scrapy.Field(output_processor=TakeFirst())
    product_url = scrapy.Field(output_processor=TakeFirst())
    brand_name = scrapy.Field(output_processor=TakeFirst())
    price = scrapy.Field(output_processor=TakeFirst())
    fit = scrapy.Field()
    colors = scrapy.Field(output_processor=TakeFirst())
    details_and_care_info = scrapy.Field(output_processor=TakeFirst())
    details_and_care_list = scrapy.Field(output_processor=TakeFirst())
    image_urls = scrapy.Field(
        output_processor=MapCompose(lambda x: url_query_cleaner(x)))
    images = scrapy.Field()
    spider_name = scrapy.Field(output_processor=TakeFirst())

def parse_product(self, response):
    loader = ProductLoader(Product(), response=response)
    identifier = response.xpath(
        '//input[@name="productId"]/@value').extract_first()
    if not identifier:
        loader.add_value('stock', 0)
        identifier = response.xpath('//text()').re('productId=(.+?)&')
    loader.add_value('identifier', identifier)
    loader.add_value('url', url_query_cleaner(response.url))
    loader.add_css('name', 'div.productTitleDescriptionContainer h1::text')
    loader.add_css('price', 'p.pricePerUnit::text')
    loader.add_css('sku', 'p.itemCode::text', re='Item code:(.+)')
    category = response.xpath(
        '//ul[@id="breadcrumbNavList"]//a/span/text()').extract()
    if 'Home' in category:
        category.remove('Home')
    loader.add_value('category', category)
    image_url = response.css('img#productImageID::attr(src)').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    item = loader.load_item()
    item['metadata'] = {'reviews': []}
    review_id = response.xpath('//text()').re_first("productId: '(.+?)'")
    reviews_url = ('http://sainsburysgrocery.ugc.bazaarvoice.com/8076-en_gb/'
                   '%s/reviews.djs?format=embeddedhtml') % review_id
    yield Request(reviews_url, callback=self.parse_review_page,
                  meta={'item': item})

def parse(self, response):
    self.state['items_count'] = self.state.get('items_count', 0) + 1
    response = response.replace(url=url_query_cleaner(response.url))
    #self.log('Page: %s' % response.url)
    hxs = HtmlXPathSelector(response)
    index_level = self.determine_level(response)
    if index_level in [1, 2, 3, 4]:
        #self.save_to_file_system(index_level, response)
        relative_urls = self.get_follow_links(index_level, hxs)
        if relative_urls is not None:
            for url in relative_urls:
                yield Request(url, callback=self.parse)
    elif index_level == 5:
        #self.log('Level 5, parsing profile')
        linkedin_id = self.get_linkedin_id(response.url)
        person_profile = LinkedinProfileParser.parse_profile(hxs)
        if person_profile is None:
            return
        linkedin_id = UnicodeDammit(
            urllib.unquote_plus(linkedin_id)).markup.encode("utf-8")
        #self.log('ID: ' + linkedin_id)
        if linkedin_id:
            m = hashlib.md5()
            m.update(linkedin_id)
            person_profile['_id'] = UnicodeDammit(m.hexdigest()).markup
            person_profile['profile_id'] = linkedin_id
            person_profile['url'] = UnicodeDammit(response.url).markup
            yield person_profile

def parse_product(self, response):
    loader = ProductLoader(Product(), response=response)
    identifier = response.css('input.productId::attr(value)').extract_first()
    loader.add_value('identifier', identifier)
    loader.add_value('url', url_query_cleaner(response.url))
    loader.add_css('name', '.title h1::text')
    category = response.css('.breadcrumbs a::text').extract()
    loader.add_value('category', category[2:])
    image_url = response.css(
        '.productDetail1 .image img::attr(src)').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    loader.add_value('brand', category[-1])
    item = loader.load_item()

    for option in response.xpath('//div[@id="valStaffelSelection"]//li'):
        loader = ProductLoader(Product(), selector=option)
        loader.add_value(None, item)
        identifier = item['identifier'] + '-' + option.xpath(
            'input/@value').extract_first()
        loader.replace_value('identifier', identifier)
        url = item['url'] + '?' + option.xpath('@class').extract_first()
        loader.replace_value('url', url)
        loader.add_css('name', 'span.label::text')
        price = option.css('div.price::text').extract()
        loader.replace_value('price', price.pop())
        loader.replace_value('sku', identifier)
        yield loader.load_item()

def load_products(response):
    """Load a ProductItem from the product page response."""
    loader = ProductItemLoader(item=ProductItem(), response=response)

    url = url_query_cleaner(response.url, ['snr'], remove=True)
    url = canonicalize_url(url)
    loader.add_value('url', url)

    publisher = response.xpath(
        '//div[contains(concat(" ", normalize-space(@class), " "), '
        '" product-details-row ")][6]/div[2]/a[1]/span[2]/text()')
    # .xpath() returns a (possibly empty) SelectorList, never None, so test
    # emptiness: fall back to the fifth details row when the sixth row has
    # no publisher link.
    if not publisher:
        loader.add_xpath('developer', '//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][5]/div[2]/a[1]/span[2]/text()')
        loader.add_xpath('publisher', '//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][5]/div[2]/a[2]/span[2]/text()')
    else:
        loader.add_xpath('developer', '//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][6]/div[2]/a[1]/span[2]/text()')
        loader.add_xpath('publisher', '//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][6]/div[2]/a[2]/span[2]/text()')
    loader.add_xpath('release_date', '//div[contains(concat(" ", normalize-space(@class), " "), " product-details-row ")][4]/div[2]/text()')
    loader.add_css('app_name', '.header__title ::text')
    loader.add_css('specs', '.game-features__title ::text')
    loader.add_css('genre', '.product-details__data span a.un ::text')

    # extract_first() returns None instead of raising, so no try/except is
    # needed around the price lookup.
    price = response.css(
        '.module-buy__info > meta:nth-child(2) ::attr(content)').extract_first()
    if price is None:
        price = '0.00'
    price_disc = price
    loader.add_value('price', price)
    loader.add_value('discount_price', price_disc)
    loader.add_css('rating',
                   'div.average-rating:nth-child(1) > meta:nth-child(4) ::attr(content)')
    return loader.load_item()

def parse_product(self, response):
    options = response.css('.pg_select')
    if options:
        selected_option = options.xpath('option[@selected]')
        if not selected_option:
            for url in options.xpath('.//@data-href').extract():
                yield Request(response.urljoin(url_query_cleaner(url)),
                              self.parse_product)
            return

    loader = ProductLoader(Product(), response=response)
    sku = response.xpath(
        '//div[@id="content"]//input[@name="sku"]/@value').extract_first()
    loader.add_value('identifier', sku)
    loader.add_value('sku', sku)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//strong[@itemprop="name"]/text()')
    loader.add_css('price', 'div.show h5 ::text')
    loader.add_css('price', '.nowPrice ::text')
    loader.add_css('price', '.typicalPrice h5 ::text')
    category = response.xpath(
        '//input[@name="productDetailsDTO"]/@value').re('"category":"(.+?)"')
    if category:
        loader.add_value('category', category[0].split('/'))
    image_url = response.css('ul#galleryImages a::attr(href)').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    loader.add_xpath('brand',
                     '//span[@itemprop="brand"]//span[@itemprop="name"]/text()')
    if response.css('div#content p.oos'):
        loader.add_value('stock', 0)
    yield loader.load_item()

def parse(self, response):
    hxs = HtmlXPathSelector(response)
    categories = response.xpath(
        '//ul[@id="category-level-1"]//a/@href').extract()
    for category in categories:
        yield Request(response.urljoin(category))

    products = response.css('div.b-product_title a::attr(href)').extract()
    for product in products:
        yield Request(url_query_cleaner(response.urljoin(product)),
                      callback=self.parse_product, meta=response.meta)

    pages = response.css('ul.b-pagination a::attr(href)').extract()
    for url in pages:
        yield Request(url, meta=response.meta)

    identifier = hxs.select(
        '//p[contains(@class, "productid")]/@class').re('p_(.*)')
    if identifier:
        yield Request(response.url, dont_filter=True,
                      callback=self.parse_product, meta=response.meta)

def parse_url(self, url: URL) -> str:
    # Keep the query string if it might be a feed string.
    # Wikipedia, for example, uses query strings to differentiate feeds.
    if any(key in url.query for key in self.valid_keys):
        return canonicalize_url(str(url))
    # Canonicalizing the URL is about 4x slower, but worth it to prevent
    # duplicate requests.
    return canonicalize_url(url_query_cleaner(str(url)))

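# Illustrative behaviour of the two branches above, using w3lib's helpers
# directly (hypothetical URLs; self.valid_keys is assumed to contain 'feed'):
from w3lib.url import canonicalize_url, url_query_cleaner

# Non-feed URL: the query is dropped, then the result is canonicalized.
assert canonicalize_url(url_query_cleaner('https://example.com/blog/?utm_source=x')) == \
    'https://example.com/blog/'
# Feed-like URL: canonicalize_url alone keeps (and sorts) the query string.
assert canonicalize_url('https://en.wikipedia.org/w/index.php?title=X&feed=atom') == \
    'https://en.wikipedia.org/w/index.php?feed=atom&title=X'
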
def test_url_query_cleaner_keep_fragments(self):
    self.assertEqual(
        "product.html?id=200#foo",
        url_query_cleaner(
            "product.html?id=200&foo=bar&name=wired#foo",
            ["id"],
            keep_fragments=True,
        ),
    )

def parse_product(self, response):
    base_sku = response.xpath('//@data-ref').extract_first()
    identifier = re.search(r'p(\d+)$',
                           url_query_cleaner(response.url)).group(1)
    url = 'https://www.andrewjamesworldwide.com/ajax/get_product_options/{0}'.format(
        identifier)
    data = json.load(urlopen(url))
    attributes = [attr['values'] for attr in data['attributes']]
    if [] in attributes:
        url = add_or_replace_parameter(url, 'attributes[1]',
                                       attributes[0][0]['value_id'])
        data = json.load(urlopen(url))
        attributes = [attr['values'] for attr in data['attributes']]
    variants = itertools.product(*attributes)
    for variant in variants:
        url = 'https://www.andrewjamesworldwide.com/ajax/get_product_options/{0}'.format(
            identifier)
        for idx, option in enumerate(variant):
            url = add_or_replace_parameter(
                url, 'attributes[{0}]'.format(idx + 1), option['value_id'])
        data = json.load(urlopen(url))
        selection = data['selection'].values()[0]  # Python 2: dict values are a list
        sku = selection['reference'].strip()
        if not sku and base_sku not in self.skus_found:
            sku = base_sku
        if sku not in self.skus.keys():
            continue
        if sku in self.skus_found:
            self.logger.info('Duplicated SKU is found: %s' % sku)
        self.skus_found.add(sku)
        loader = ProductLoader(item=Product(), response=response)
        loader.add_value('sku', sku)
        loader.add_value('identifier', selection['product_id'])
        loader.add_xpath('name', '//span[@id="js-product-title"]/text()')
        loader.add_value('name', [option['value'] for option in variant])
        loader.replace_value('name', selection['title'])
        loader.add_value('url', response.url)
        loader.add_value('price', selection['price_inc'])
        category = response.css('div.breadcrumb a::attr(title)').extract()
        loader.add_value('category', category[1:])
        try:
            image_url = [attr['images'][0]['image']
                         for attr in data['attributes'][-1]['values']]
        except IndexError:
            image_url = response.xpath(
                '//div[@id="js-product-image"]//@src').extract()
        loader.add_value('image_url', response.urljoin(image_url[0]))
        loader.add_value('brand', "Andrew James")
        item = loader.load_item()
        metadata = AndrewJamesMeta()
        metadata['asin'] = self.skus[sku]['ASIN']
        item['metadata'] = metadata
        yield item

def _clean_url(self, url):
    """
    Canonicalizes the url, as is done in Scrapy, and keeps only
    USEFUL_QUERY_KEYS. Also strips the trailing slash to help with
    identifying dupes.
    """
    clean_url = url_query_cleaner(url, parameterlist=USEFUL_QUERY_KEYS)  # , remove=True
    return canonicalize_url(clean_url).rstrip('/')

def fingerprint(self, lnk, **kw):
    url = canonicalize_url(lnk.url)
    # pconf = kw.get('conf')
    # if not pconf:
    #     pconf = xconf.get_page(project, job, lnk.page)
    qo = kw.get('df_query_only')
    qr = kw.get('df_query_remove')
    if qo:
        url = url_query_cleaner(url, arg_to_iter(qo), remove=False)
    if qr:
        url = url_query_cleaner(url, arg_to_iter(qr), remove=True)
    cnf = lnk.conf
    mds = [lnk.page, url]
    for key in ('method', 'headers', 'data', 'params', 'auth', 'cookies'):
        mds.append(cnf.get(key))
    return md5sum(mds)

def load_product(response):
    """Load a ProductItem from the product page response."""
    loader = ProductItemLoader(item=ProductItem(), response=response)

    url = url_query_cleaner(response.url, ['snr'], remove=True)
    url = canonicalize_url(url)
    loader.add_value('url', url)

    found_id = re.findall('/app/(.*?)/', response.url)
    if found_id:
        id = found_id[0]
        reviews_url = f'http://steamcommunity.com/app/{id}/reviews/?browsefilter=mostrecent&p=1'
        loader.add_value('id', id)

    # Publication details.
    details = response.css('.details_block').extract_first()
    try:
        details = details.split('<br>')
        for line in details:
            line = re.sub('<[^<]+?>', '', line)  # Remove tags.
            line = re.sub('[\r\t\n]', '', line).strip()
            for prop, name in [
                ('Title:', 'title'),
                ('Genre:', 'genres'),
                ('Release Date:', 'date'),
            ]:
                if prop in line:
                    item = line.replace(prop, '').strip()
                    loader.add_value(name, item)
    except:  # noqa E722
        pass

    loader.add_css('app_name', '.apphub_AppName ::text')

    price = response.css('.game_purchase_price ::text').extract_first()
    if not price:
        price = response.css('.discount_original_price ::text').extract_first()
        loader.add_css('discount_price', '.discount_final_price ::text')
    loader.add_value('price', price)

    sentiment = response.css('.game_review_summary').xpath(
        '../*[@itemprop="description"]/text()').extract()
    loader.add_value('sentiment', sentiment)
    loader.add_css('n_reviews', '.responsive_hidden', re=r'\(([\d,]+) reviews\)')
    loader.add_xpath(
        'metascore',
        '//div[@id="game_area_metascore"]/div[contains(@class, "score")]/text()')

    early_access = response.css('.early_access_header')
    if early_access:
        loader.add_value('early_access', True)
    else:
        loader.add_value('early_access', False)

    return loader.load_item()

def parse_product(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    brand = response.xpath(
        '//span[@itemprop="http://schema.org/manufacturer"]/text()'
    ).extract_first() or response.xpath(
        '//span[@itemprop="http://schema.org/brand"]/text()'
    ).extract_first()
    identifier = hxs.select('//input[@id="itemsArray"]/@value').extract()
    if not identifier:
        return
    sku = response.xpath('//*[@itemprop="mpn"]/text()').extract()[0].strip()
    product_loader = ProductLoader(item=Product(), selector=hxs)
    image_url = response.css('img#productMainImage::attr(src)').extract_first()
    if image_url:
        product_loader.add_value('image_url', response.urljoin(image_url))
    category = response.meta.get('category', '')
    if not category:
        category = hxs.select(
            '//div[@id="breadcrumb"]/ul/li/a/text()').extract()[-2].strip()
    product_loader.add_value('category', category)
    product_name = response.xpath('//div[@id="product"]//h1//text()').re(r'\S+')
    product_loader.add_value('name', product_name)
    product_loader.add_xpath('url', 'link[@rel="canonical"]/@href')
    product_loader.add_value('url', url_query_cleaner(response.url))
    product_loader.add_value('identifier', identifier.pop())
    product_loader.add_value('brand', brand)
    product_loader.add_value('sku', sku)
    price = ''.join(hxs.select(
        '//table[contains(@class, "pricing")]//td[@class="threeColTd"][1]/text()'
    ).extract()).strip().split('(')[0].strip().replace(u'\xa3', '')
    if price:
        price = extract_price(price)
        price = price.quantize(Decimal('.01'))
        product_loader.add_value('price', price)
    else:
        product_loader.add_value('price', 0)
    stock = response.css('span.availability::text').re(r'\d+')
    if stock:
        product_loader.add_value('stock', stock[0])
    else:
        product_loader.add_value('stock', 0)
    yield product_loader.load_item()

def parse_product(self, response):
    base_url = get_base_url(response)
    product_links = response.xpath(
        '//div[@id="products"]//a[contains(@class,"qa-product-link")]/@href'
    ).extract()
    if product_links:
        for link in product_links:
            yield Request(url_query_cleaner(response.urljoin(link)),
                          callback=self.parse_product)
        return

    product_name = response.xpath('//h1[@itemprop="name"]/text()').extract()
    if not product_name:
        return
    product_name = product_name[-1].strip()

    # Escaped quotes are swapped for a placeholder entity before the regex
    # runs, then restored afterwards.
    category = re.findall("name:'Category', value:'([^']+)'",
                          response.body.replace("\\'", "&quote;"))
    if category:
        category = category.pop().replace("&quote;", "'")
    else:
        category = ""

    brand = response.xpath('//h1[@itemprop="name"]/span/text()').extract()
    brand = brand[0].strip() if brand else ''

    rrp_by_sku = {}
    sku_data = re.search(r'BC.product.skusCollection = \$.parseJSON\((.*)\);',
                         response.body)
    if sku_data:
        sku_data = json.loads(demjson.decode(sku_data.group(1),
                                             encoding='utf8'))
        rrp_by_sku = {sku.upper(): str(opt['price']['high'])
                      for sku, opt in sku_data.iteritems()
                      if opt['price']['high'] > opt['price']['low']}

    options = response.xpath('//li[contains(@class,"qa-variant-item-")]')
    for option in options:
        product_loader = ProductLoader(item=Product(), selector=option)
        sku = option.xpath('./@sku-value').extract()
        sku = sku[0]
        product_loader.add_value('sku', sku)
        product_loader.add_value('identifier', sku)
        option_name = option.xpath('./@title').extract()[0].strip()
        option_name = (option_name.replace('One Color, One Size', '')
                       .replace(', One Size', '')
                       .replace('One Color, ', '').strip())
        if option_name != '':
            product_loader.add_value('name', product_name + ', ' + option_name)
        else:
            product_loader.add_value('name', product_name)
        image_url = option.xpath('./@data-img-large').extract()
        if image_url:
            product_loader.add_value('image_url',
                                     urljoin_rfc(base_url, image_url[0]))
        price = extract_price(option.xpath('./@data-price').extract()[0])
        product_loader.add_value('price', price)
        product_loader.add_value('url', response.url)
        product_loader.add_value('brand', brand)
        product_loader.add_value('category', category)
        product = product_loader.load_item()
        metadata = CRCMeta()
        metadata['rrp'] = rrp_by_sku.get(sku.upper(), '')
        product['metadata'] = metadata
        yield product

class RebelSport(CrawlSpider):
    name = 'kitbag_au-rebelsport'
    allowed_domains = ['rebelsport.com.au']
    start_urls = [
        'http://www.rebelsport.com.au/store/fangear/soccer-football/604'
    ]

    categories = LinkExtractor(
        restrict_css='.secondary-menu',
        process_value=lambda url: add_or_replace_parameter(
            url, 'pageSize', '500'))
    pages = LinkExtractor(restrict_css='.pagination')
    products = LinkExtractor(
        restrict_css='.product',
        process_value=lambda url: make_variant_url(url_query_cleaner(url)))

    rules = (Rule(categories), Rule(products, callback='parse_product'))

    def parse_product(self, response):
        data = response.xpath('//script/text()').re('{\\\\"Variants.+}')[0]
        data = json.loads(data.replace('\\"', '"'))
        variants = data['Variants']
        for variant in variants:
            url = response.urljoin(variant['ProductPLU'])
            yield Request(make_variant_url(url), self.parse_product)

        loader = ProductLoader(item=Product(), response=response)
        identifier = response.xpath(
            '//input[@id="ProductPLU"]/@value').extract_first()
        loader.add_value('identifier', identifier)
        loader.add_value('sku', identifier)
        loader.add_value('url', response.url)
        loader.add_xpath('name', '(//h1[@itemprop="name"]/text())[1]')
        metadata = {}
        for i in xrange(3):  # Python 2
            variant_name = data['Variant%dSelected' % (i + 1)]
            if variant_name and variant_name != 'N/A':
                loader.add_value('name', variant_name)
                metadata[data['Variant%dHeader' % (i + 1)]] = variant_name
                if 'size' in variant_name.lower():
                    metadata['size'] = variant_name[5:].strip()
        price = response.css('.price-value .currency::text').extract()
        loader.add_value('price', price.pop())
        category = response.css('.breadcrumb a::text').extract()
        loader.add_value('category', category[1:])
        loader.add_css('image_url', '.product-image::attr(src)')
        loader.add_xpath('brand', '//meta[@itemprop="brand"]/@content')
        loader.add_value('shipping_cost', '7.95')
        stock = response.css('.product-stock-widget::attr(ng-init)').re(
            r'AvailableOnline: (\w+)')[0]
        if stock != 'true':
            loader.add_value('stock', 0)
        item = loader.load_item()
        item['metadata'] = metadata
        yield item

def test_url_query_cleaner(self):
    self.assertEqual(
        'product.html?id=200',
        url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id']))
    self.assertEqual(
        'product.html?id=200',
        url_query_cleaner("product.html?&id=200&&foo=bar&name=wired", ['id']))
    self.assertEqual(
        'product.html',
        url_query_cleaner("product.html?foo=bar&name=wired", ['id']))
    self.assertEqual(
        'product.html?id=200&name=wired',
        url_query_cleaner("product.html?id=200&foo=bar&name=wired",
                          ['id', 'name']))
    self.assertEqual(
        'product.html?id',
        url_query_cleaner("product.html?id&other=3&novalue=", ['id']))
    self.assertEqual(
        'product.html?d=1&d=2&d=3',
        url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'],
                          unique=False))
    self.assertEqual(
        'product.html?id=200&foo=bar',
        url_query_cleaner("product.html?id=200&foo=bar&name=wired#id20",
                          ['id', 'foo']))
    self.assertEqual(
        'product.html?foo=bar&name=wired',
        url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'],
                          remove=True))
    self.assertEqual(
        'product.html?name=wired',
        url_query_cleaner("product.html?id=2&foo=bar&name=wired",
                          ['id', 'foo'], remove=True))
    self.assertEqual(
        'product.html?foo=bar&name=wired',
        url_query_cleaner("product.html?id=2&foo=bar&name=wired",
                          ['id', 'footo'], remove=True))

def is_href_matching(url_string: str, regex: re.Pattern) -> bool:
    """
    Check if the regex has any match in the url string.

    :param url_string: URL as string
    :param regex: Compiled regex used to search the URL
    :return: boolean
    """
    if regex.search(url_query_cleaner(url_string)):
        return True
    return False

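# Example usage (hypothetical pattern and URL): the query string is removed
# before matching, so an end-anchored extension pattern still matches.
import re
assert is_href_matching('https://example.com/item.prd?id=1',
                        re.compile(r'\.prd$'))
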
def resultsPage(self, response):
    table = response.selector.xpath('/html/body/div/div[3]/div[2]/table')
    cisResultLinks = table.css('.left-align').xpath('a/@href').extract()
    for link in cisResultLinks:
        yield Request(urlparse.urljoin(response.url, link),
                      callback=self.surveyResult)
    if len(response.selector.xpath('//input[@value="Next page"]')) != 0:
        yield FormRequest.from_response(
            response,
            formxpath='//div[@class="page-forward"]/form[1]',
            url=url_query_cleaner(response.url),  # workaround for scrapy problem
            callback=self.resultsPage)

def _link_callback(uri: str, rel: str) -> str:
    """Replace default link loading in xhtml2pdf.

    We don't want the pdf generation process to actually attempt to hit the
    network or filesystem, so we return a placeholder data URL for links that
    appear to be images; otherwise we simply return an empty string so that
    nothing is loaded."""
    uri = url_query_cleaner(uri)
    type_, _ = mimetypes.guess_type(uri)
    if type_ and type_.startswith('image/'):
        return _BLACK_PIXEL_DATA_URL
    else:
        return ''

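# Sanity checks for the callback above (hypothetical URIs): an image-typed
# link yields the placeholder pixel, anything else an empty string.
assert _link_callback('https://example.com/logo.png?v=2', rel='') == _BLACK_PIXEL_DATA_URL
assert _link_callback('https://example.com/page.html', rel='') == ''
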
def parse_products_list(self, response):
    hxs = HtmlXPathSelector(response)
    base_url = get_base_url(response)
    brand = response.meta.get('brand')
    for url in hxs.select('//div[@class="product_name"]//a/@href | '
                          '//div[@class="product_features"]/h3/a/@href').extract():
        self.jar_counter += 1
        yield Request(url_query_cleaner(response.urljoin(url)),
                      callback=self.parse_product, cookies={},
                      meta={'cookiejar': self.jar_counter, 'brand': brand})
    for url in hxs.select('//ul[@class="catthumb_list clearfix"]'
                          '//div[@class="title"]/a/@href').extract():
        yield Request(urljoin_rfc(base_url, url),
                      callback=self.parse_products_list)

def parse_product(self, response):
    if response.url.endswith('page-not-found.page'):
        return
    formdata = {}
    for inp in response.xpath('//form[@id="variant-form"]//input'):
        formdata[inp.xpath('@name').extract_first()] = inp.xpath(
            '@value').extract_first()
    if not formdata:
        self.logger.warning('No data on %s' % response.url)
        return
    # Inputs without a name attribute end up under the None key; drop them
    # (pop avoids a KeyError when every input is named).
    formdata.pop(None, None)
    options = response.css('.vContainer .variantDataElement')
    for option in options:
        formdata[option.xpath('@name').extract_first()] = option.xpath(
            '@data-variant-value').extract_first()
    r = FormRequest.from_response(response,
                                  formxpath='//form[@id="variant-form"]',
                                  formdata=formdata,
                                  callback=self.parse_product)
    yield r

    loader = ProductLoader(item=Product(), response=response)
    sku = response.xpath('//input[@id="skuIdVal"]/@value').extract_first()
    if sku != url_query_parameter(response.url, 'skuId'):
        url = add_or_replace_parameter(url_query_cleaner(response.url),
                                       'skuId', sku)
        yield Request(url, self.parse_product)
        return
    loader.add_value('identifier', sku)
    loader.add_value('url', response.url)
    loader.add_xpath('name', '//h1[@id="productLabel"]//text()')
    #loader.add_css('name', '.selected .variantDisplayName_title ::text')
    loader.add_css('price', '.current-price ::text')
    loader.add_value('sku', sku)
    category = response.xpath(
        '//div[@id="breadcrumb"]//li//span[@itemprop="title"]/text()').extract()
    loader.add_value('category', category[-4:-1])
    image_url = response.xpath('//img[@itemprop="image"]/@src').extract_first()
    if image_url:
        loader.add_value('image_url', response.urljoin(image_url))
    loader.add_xpath('brand',
                     '//div[@itemprop="brand"]//span[@itemprop="name"]/text()')
    loader.add_value('shipping_cost', 3)
    #if not response.css('.stock-tag.in-stock') and not response.xpath('//link[@href="http://schema.org/InStock"]') and not response.css('.available-from'):
    if not response.css('.add-to-basket'):
        loader.add_value('stock', 0)
    if loader.get_output_value('price'):
        yield loader.load_item()

def is_valid_filetype(url: str) -> bool:
    """
    Check if a url string has an invalid filetype extension.

    :param url: URL string
    :return: boolean
    """
    # if file_regex.search(url.strip()):
    #     return False
    # return True
    suffix = pathlib.Path(url_query_cleaner(url)).suffix.strip(".").lower()
    if suffix in invalid_filetypes:
        return False
    return True

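# How the suffix check above behaves, assuming a hypothetical
# invalid_filetypes = {'jpg', 'zip'}: the query string is stripped before the
# suffix is read, and case is normalized.
assert not is_valid_filetype('https://example.com/photo.JPG?size=large')
assert is_valid_filetype('https://example.com/article.html')
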
def img_base64_to_link(self, objs: QuerySet, html: str):
    """
    Replace the Base64 images in an HTML string with the image urls of the
    objects in the objs queryset, and return the new HTML string.

    :param objs:
    :param html:
    :return:
    """
    soup = BeautifulSoup(html, 'html.parser')
    img_tags = soup.find_all("img")
    for obj, img_tag in zip(objs, img_tags):
        # Get rid of the Amazon token
        url = url_query_cleaner(obj.image.url)
        new_img_tag = soup.new_tag('img', src=url)
        img_tag.replace_with(new_img_tag)
    return str(soup)

def parse_search_results(self, response):
    products = response.xpath(
        '//table[@id="sProdList"]/tbody/tr[td[@class="productImage"]]')
    for product in products:
        sku = product.css('p.wordBreak a::text').extract_first()
        if sku and sku.strip().upper() == response.meta['sku']:
            url = product.xpath('.//a[@class="sku"]/@href').extract_first().strip()
            url = url_query_cleaner(url)
            yield Request(url, self.parse_product)

    sku = response.xpath('//*[@itemprop="mpn"]/text()').extract_first()
    if not products and sku and sku.strip().upper() == response.meta['sku']:
        yield Request(url_query_cleaner(response.url), self.parse_product,
                      dont_filter=True)

    urls = response.css('ul.categoryList a::attr(href)').extract()
    if not products and not sku and urls:
        for url in urls:
            yield Request(url, self.parse_search_results, meta=response.meta)

def parse(self, response):
    self.state['items_count'] = self.state.get('items_count', 0) + 1
    response = response.replace(url=url_query_cleaner(response.url))
    hxs = HtmlXPathSelector(response)
    index_level = self.determine_level(response)
    if index_level in [1, 2]:
        relative_urls = self.get_follow_links(index_level, hxs)
        if relative_urls is not None:
            for url in relative_urls:
                yield Request(url, callback=self.parse)
    elif index_level == 3:
        vacature = ITBanenParser.parse_profile(hxs)
        if vacature is None:
            return
        vacature['url'] = UnicodeDammit(response.url).markup
        yield vacature

def parse_content(self, response):
    self.log("Crawled %s %d" % (response.url, response.status),
             level=scrapy.log.INFO)
    #self.log("Crawled (%d) <GET %s>" % (response.status, response.url), level=scrapy.log.INFO)
    if response.status / 100 != 2:
        yield scrapy.Request(url=response.url, callback=self.parse_content)
        return
    base_url = get_base_url(response)
    # Parse the articles.
    for href in response.xpath('//table//a/@href').extract():
        if "view_abstract.aspx?" in href:
            href = url_query_cleaner(href, ("file_no",))
        elif "create_pdf.aspx?" in href:
            pass
        else:
            continue
        abs_url = urljoin_rfc(base_url, href)
        yield self.baidu_rpc_request({"url": abs_url, "src_id": 22},
                                     furl=response.url)

def parse_hotel(self, response):
    hxs = Selector(response)
    hotel = HtmlParser.extract_hotel(response.url, hxs)
    checkin = url_query_parameter(response.url, "checkin")
    checkout = url_query_parameter(response.url, "checkout")
    checkinDatetime = None
    checkoutDatetime = None
    today = datetime.date.today()
    if checkin is not None:
        checkinDatetime = datetime.datetime.strptime(checkin, "%Y-%m-%d").date()
        checkinDatetime = self.add_months(checkinDatetime, 1)
    else:
        checkinDatetime = datetime.date(today.year, today.month, 15)
    if checkout is not None:
        checkoutDatetime = datetime.datetime.strptime(checkout, "%Y-%m-%d").date()
        checkoutDatetime = self.add_months(checkoutDatetime, 1)
    else:
        checkoutDatetime = datetime.date(today.year, today.month, 16)
    maxDatetime = self.add_months(today, 18)
    if checkinDatetime < maxDatetime:
        url = url_query_cleaner(response.url)
        url = add_or_replace_parameter(url, "checkin", str(checkinDatetime))
        url = add_or_replace_parameter(url, "checkout", str(checkoutDatetime))
        #logging.warning('---------------------------- %s' % url)
        yield Request(url, callback=self.parse_hotel)
    yield hotel["hotel"]
    if len(hotel["rooms"]) > 0:
        for room in hotel["rooms"]:
            yield room

def parse(self, response):
    # Get product details if /ip/ is in the URL
    if '/ip/' in response.url:
        # Remove the unnecessary parameters from the product url
        clean_url = url_query_cleaner(response.url)
        # Create a new Product
        p = Product()
        p['url'] = clean_url
        p['title'] = response.xpath(
            "//h1[@itemprop='name']/span/text()").extract()[0].strip()
        price_data = response.xpath("//div[@itemprop='price']//text()").extract()
        if price_data:
            p['price'] = Decimal("".join(price_data[2:7]))
        else:
            p['price'] = Decimal("0")
        yield p
    # Check all of the links on the current page
    for link in response.xpath("//a/@href").extract():
        # Create an absolute url
        abs_url = urlparse.urljoin(response.url, link.strip())
        # Create a new request for a spider to crawl
        yield Request(url=abs_url)

def __parse_as_next_page__(self, response):
    refer = response.request.headers.get('Referer')
    chart = self.__chart_items[url_query_cleaner(refer)]
    yield self.do_parse(chart, response)

def url_cleaner(url):
    url = url_query_cleaner(url)
    # Strip a leading 'www.' subdomain; replacing '://www' alone would leave
    # a stray dot behind ('https://.example.com').
    url = url.replace('://www.', '://')
    return url

def remove_url_parameter(url):
    return url_query_cleaner(url)

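# With no parameterlist, url_query_cleaner drops the whole query string (and
# the fragment, since keep_fragments defaults to False):
assert remove_url_parameter('product.html?id=200&foo=bar#top') == 'product.html'
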
def test_url_query_cleaner(self):
    self.assertEqual(
        'product.html?id=200',
        url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id']))
    self.assertEqual(
        'product.html?id=200',
        url_query_cleaner("product.html?&id=200&&foo=bar&name=wired", ['id']))
    self.assertEqual(
        'product.html',
        url_query_cleaner("product.html?foo=bar&name=wired", ['id']))
    self.assertEqual(
        'product.html?id=200&name=wired',
        url_query_cleaner("product.html?id=200&foo=bar&name=wired",
                          ['id', 'name']))
    self.assertEqual(
        'product.html?id',
        url_query_cleaner("product.html?id&other=3&novalue=", ['id']))
    # default is to remove duplicate keys
    self.assertEqual(
        'product.html?d=1',
        url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d']))
    # unique=False disables duplicate keys filtering
    self.assertEqual(
        'product.html?d=1&d=2&d=3',
        url_query_cleaner("product.html?d=1&e=b&d=2&d=3&other=other", ['d'],
                          unique=False))
    self.assertEqual(
        'product.html?id=200&foo=bar',
        url_query_cleaner("product.html?id=200&foo=bar&name=wired#id20",
                          ['id', 'foo']))
    self.assertEqual(
        'product.html?foo=bar&name=wired',
        url_query_cleaner("product.html?id=200&foo=bar&name=wired", ['id'],
                          remove=True))
    self.assertEqual(
        'product.html?name=wired',
        url_query_cleaner("product.html?id=2&foo=bar&name=wired",
                          ['id', 'foo'], remove=True))
    self.assertEqual(
        'product.html?foo=bar&name=wired',
        url_query_cleaner("product.html?id=2&foo=bar&name=wired",
                          ['id', 'footo'], remove=True))
    self.assertEqual(
        'product.html?foo=bar',
        url_query_cleaner("product.html?foo=bar&name=wired", 'foo'))
    self.assertEqual(
        'product.html?foobar=wired',
        url_query_cleaner("product.html?foo=bar&foobar=wired", 'foobar'))