def parse_products_page(self, response):
    """Yield one ``PProduct`` item per product embedded in a results page.

    The product list is read from the inline ``<script>`` whose text
    mentions ``ProductResultsDesktop.ProductResults``.  That text is passed
    to ``transform_products_data`` and every entry found under
    ``data -> ProductResult -> Products`` becomes one item.

    If the page carries no such script, a message is logged and no items
    are yielded (instead of failing with ``IndexError``).

    :param response: response for the results page; must provide
        ``xpath()`` and ``url``.
    :returns: generator of ``PProduct`` items with the ``name``, ``pid``,
        ``url`` and ``json`` fields filled in.
    """
    results_script = response.xpath(
        "//script[contains(., 'ProductResultsDesktop.ProductResults')]/text()"
    ).extract()

    if results_script:
        results = transform_products_data(results_script[0])
        # Named ``product`` rather than ``json`` so the loop variable does
        # not shadow the standard-library module of the same name.
        for product in results['data']['ProductResult']['Products']:
            item = PProduct()
            item['name'] = product['Title']
            item['pid'] = product['Id']
            item['url'] = product['ProductPageUrl']
            # Keep the complete raw record alongside the extracted fields.
            item['json'] = product
            yield item
    else:
        self.log("No ProductResults script found on %s" % response.url)

    product_page_pattern = "//p[@class='product-title']/a/@href"
    for href in response.xpath(product_page_pattern):
        # Drop the query string so only the bare product path remains.
        base_product = href.extract().split('?')[0]
        # NOTE(review): product_url is computed but never used in the code
        # visible here -- presumably it should feed a follow-up request;
        # confirm the intended consumer.
        product_url = urlparse.urljoin(response.url, base_product)
def test_product_data_transform(self):
    """Check that ``transform_products_data`` returns a non-empty mapping.

    The input is a sample ``React.render(...)`` call of the kind found in
    the results-page script, carrying a small JSON payload.
    """
    sample_script = (
        "React.render(React.createElement("
        "ProductResultsDesktop.ProductResults, "
        "{\"data\":{\"ProductResult\":{\"time\":7}}}), "
        "document.getElementById( 'npr-product-results-page' ));"
    )
    parsed = transform_products_data(sample_script)
    key_count = len(parsed.keys())
    assert key_count > 0