def _parse_related_products(self, response):
    """Extract the "related products" recommendations embedded in the page
    and attach them to the product item from ``response.meta``.

    The recommendation widget ships its markup as an escaped HTML string
    inside a JavaScript literal (``html:'...'}]},``); this pulls that
    fragment out, parses it, and collects (title, url) pairs.

    Returns the product item, with ``prod['related_products']`` set to
    ``{strategy_key: [RelatedProduct, ...]}`` when the block is present.
    """
    prod = response.meta['product']
    # Locate the escaped HTML fragment inside the JS payload.
    match = re.search(r"html:'(.+?)'\}\]\},", response.body_as_unicode())
    if not match:
        # No recommendations block on this page; return the item as-is.
        return prod
    fragment = Selector(text=match.group(1))
    # Recommendation strategy name, used as the key of the result dict.
    key_name = is_empty(fragment.css('.rrStrat::text').extract())
    rel_prods = []
    for item in fragment.css('.rrRecs > ul > li'):
        title = is_empty(item.css('.rrItemName > a ::text').extract())
        link = is_empty(item.css('a.rrLinkUrl::attr(href)').extract())
        # The real target URL is carried in the 'ct' query parameter of
        # the tracking link. Fall back to the raw link when 'ct' is
        # missing (the original indexed qs['ct'] and raised KeyError).
        qs = urlparse.parse_qs(urlparse.urlparse(link).query)
        target = is_empty(qs.get('ct', [link]))
        rel_prods.append(RelatedProduct(title=title, url=target))
    prod['related_products'] = {key_name: rel_prods}
    return prod
def start_requests(self):
    """Authenticate, discover how many listing pages exist, and yield one
    ``scrapy.Request`` per page.

    Falls back to a single request for ``self.start_urls[0]`` when the
    pager/amount element is not found in the authorised response.
    """
    with requests.Session() as s:
        # Mount the retrying adapter on the session that is actually
        # used (the original mounted adapters on a throwaway session
        # that was immediately shadowed by this `with` block).
        adapter = requests.adapters.HTTPAdapter(max_retries=3)
        s.mount('http://', adapter)
        s.mount('https://', adapter)
        # The original template escaped every brace ({{...}}), so
        # .format() substituted nothing and literal asterisks were sent.
        # Use real placeholders for the credentials.
        body = '{{"login": {{"username": "{email}", "password": "{password}"}}}}'.format(
            email=self.login, password=self.password)
        # Set auth cookies. A credential payload belongs in a POST body;
        # the original issued a GET carrying a request body.
        s.post(self.AUTH_URL, data=body, headers=self.AUTH_HEADERS,
               timeout=5)
        # An authorised request for the first listing page.
        response = s.post(self.start_urls[0], headers=self.AUTH_HEADERS,
                          timeout=5)
        page_html = response.text

    total_match = Selector(text=page_html).xpath(
        '//div[@class="pager"]/p[@class="amount"]/text()').extract()
    if total_match:
        count_match = re.search(r'(\d+) gesamt', total_match[0])
        if count_match:
            total = int(count_match.group(1))
            # 50 items per page; round up with integer math. The
            # original used float-producing `/ 50`, checked against a
            # mismatched `* 25`, and compared a number to a string.
            page_count = (total + 49) // 50
            for i in range(1, page_count + 1):
                yield scrapy.Request(
                    url=self.start_urls[0] + '?p=' + str(i),
                    callback=self.parse_links,
                    headers=self.HEADERS,
                    dont_filter=True)
    else:
        yield scrapy.Request(url=self.start_urls[0],
                             callback=self.parse_links,
                             headers=self.HEADERS,
                             dont_filter=True)