Example #1
import re
from urllib.parse import urlparse, parse_qs

from scrapy.selector import Selector

    # `is_empty` and `RelatedProduct` are helpers from the surrounding
    # crawler project and are assumed to be importable.
    def _parse_related_products(self, response):
        prod = response.meta['product']
        # The recommendations block arrives as an escaped HTML string
        # embedded in an inline JavaScript payload.
        match = re.search(r"html:'(.+?)'\}\]\},", response.text)
        if not match:
            return prod
        html = Selector(text=match.group(1))
        key_name = is_empty(html.css('.rrStrat::text').extract())
        items = html.css('.rrRecs > ul > li')
        rel_prods = []
        for item in items:
            title = is_empty(item.css('.rrItemName > a ::text').extract())
            url = is_empty(item.css('a.rrLinkUrl::attr(href)').extract())
            # The real target URL travels in the `ct` query parameter of
            # the tracking link; .get() avoids a KeyError when it is absent.
            qs = parse_qs(urlparse(url).query)
            url = is_empty(qs.get('ct', []))
            rel_prods.append(RelatedProduct(title=title, url=url))
        prod['related_products'] = {key_name: rel_prods}
        return prod
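
The parser above receives a half-built item through `response.meta['product']`. A minimal sketch of the requesting side, assuming a hypothetical `parse_product` callback and an illustrative CSS selector for the recommendations URL (neither is from the original):

import scrapy

    def parse_product(self, response):
        # `product` stands in for the item assembled earlier in the spider.
        product = response.meta['product']
        # Illustrative selector; the real page markup will differ.
        related_url = response.urljoin(
            response.css('a.related::attr(href)').get())
        # Carry the item along so _parse_related_products can finish it.
        yield scrapy.Request(related_url,
                             callback=self._parse_related_products,
                             meta={'product': product})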
Example #2
import re

import requests
import scrapy
from scrapy.selector import Selector

    def start_requests(self):
        # One retrying adapter is enough; mount it for both schemes on the
        # session that is actually used (the original built a session,
        # mounted adapters, then discarded it by opening a second session).
        adapter = requests.adapters.HTTPAdapter(max_retries=3)
        # Login payload; the placeholder names are reconstructed from the
        # format() keyword arguments.
        body = '{{"login": {{"username": "{email}", "password": "{password}"}}}}'.format(
            email=self.login, password=self.password)
        with requests.Session() as s:
            s.mount('http://', adapter)
            s.mount('https://', adapter)
            # Set auth cookies by POSTing the credentials.
            s.post(self.AUTH_URL,
                   data=body,
                   headers=self.AUTH_HEADERS,
                   timeout=5)
            # An authorised request for the first listing page.
            response = s.get(self.start_urls[0],
                             headers=self.AUTH_HEADERS,
                             timeout=5)
            text = response.text

        total_match = Selector(text=text).xpath(
            '//div[@class="pager"]/p[@class="amount"]/text()').extract()
        if total_match:
            total_match = re.search(r'(\d+) gesamt', total_match[0])
            if total_match:
                total = int(total_match.group(1))
                # 50 results per page; round the page count up.
                page_count = total // 50
                if page_count * 50 < total:
                    page_count += 1
                for i in range(1, page_count + 1):
                    page_link = self.start_urls[0] + '?p=' + str(i)
                    yield scrapy.Request(url=page_link,
                                         callback=self.parse_links,
                                         headers=self.HEADERS,
                                         dont_filter=True)
        else:
            yield scrapy.Request(url=self.start_urls[0],
                                 callback=self.parse_links,
                                 headers=self.HEADERS,
                                 dont_filter=True)
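
The page-count bump in the loop above is just ceiling division: 120 results at 50 per page need three requests. A minimal equivalent using `math.ceil`, assuming the same page size of 50:

import math

total = 120                         # e.g. parsed from '120 gesamt'
page_count = math.ceil(total / 50)  # 3 pages for 120 results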