def test_get_review_identifier_from_url(self):
    """The review id is pulled out of review URLs from two marketplaces."""
    # (review URL, id expected back) -- one with a fragment, one with a
    # query string after the ``ref=`` path segment.
    cases = (
        ("http://www.amazon.com/review/RRSU6FI3Y6D5I/ref=cm_cr_pr_viewpnt#RRSU6FI3Y6D5I",
         'RRSU6FI3Y6D5I'),
        ("http://www.amazon.co.uk/review/R3E56QODZNNIW4/ref=cm_cr_pr_perm?ie=UTF8&ASIN=0735619670",
         'R3E56QODZNNIW4'),
    )
    for review_url, expected_id in cases:
        self.assertEqual(
            AmazonUrlCreator.get_review_identifier_from_url(review_url),
            expected_id)
def test_domain_name_from_url(self):
    """The host part is returned unchanged for .com and .co.uk product URLs."""
    # (product URL, domain expected back)
    cases = (
        ("http://www.amazon.com/gp/product/B001OW7JT8/ref=s9_psimh_gw_p201_d1_i2?pf_rd_m=ATVPDKIKX0DER&pf_rd_s=center-2&pf_rd_r=0QB7BBXPS3J4660YQN0X&pf_rd_t=101&pf_rd_p=1688200382&pf_rd_i=507846",
         'www.amazon.com'),
        ("http://www.amazon.co.uk/gp/product/B001UHOQ98/ref=s9_simh_gw_p200_d17_i3?pf_rd_m=A3P5ROKL5A1OLE&pf_rd_s=center-2&pf_rd_r=0XD4CJ7P35531QB9PRQM&pf_rd_t=101&pf_rd_p=455345507&pf_rd_i=468294",
         'www.amazon.co.uk'),
    )
    for product_url, expected_domain in cases:
        self.assertEqual(
            AmazonUrlCreator.get_domain_from_url(product_url),
            expected_domain)
def test_process_next_step_redirects_to_next_search(self):
    """
    process_next_step() raises DontCloseSpider and hands the engine a
    request for the "BA456" product page.
    """
    mock_engine = MockEngine()
    mock_crawler = MockCrawler(mock_engine)

    class TestSpider(BaseAmazonSpider):
        name = "Test Spider"
        type = 'asins'

        def get_asins_generator(self):
            # Yields (asin, sku) pairs; the sku is the lower-cased asin.
            for asin in ["BA123", "BA456"]:
                yield asin, asin.lower()

    spider = TestSpider('amazon.com')
    spider.crawler = mock_crawler
    spider.start_requests()
    spider.collected_items = []

    self.assertRaises(DontCloseSpider, spider.process_next_step, spider)

    scheduled = mock_engine.last_request
    self.assertEqual(mock_engine.crawl_called, 1)

    # the engine was asked to crawl the product URL of the second ASIN
    self.assertIs(spider, mock_engine.last_spider)
    expected_url = AmazonUrlCreator.build_url_from_asin('amazon.com', "BA456")
    self.assertEqual(scheduled.url, expected_url)
    self.assertTrue(scheduled.dont_filter)

    search_item = scheduled.meta['search_item']
    self.assertEqual(search_item['asin'], "BA456")
    self.assertEqual(search_item['sku'], "BA456".lower())

    # the response is routed to the spider's product parser
    self.assertEqual(scheduled.callback, spider.parse_product)
def build_url_amazon_direct(domain, url):
    """
    Set the ``m`` query parameter of ``url`` to the id configured for the
    marketplace identified by ``domain``.

    The domain is first passed through ``AmazonUrlCreator._fix_domain``; the
    first marker from the table below that occurs anywhere inside the fixed
    domain selects the id.
    NOTE(review): the ids look like Amazon's own merchant id per
    marketplace -- confirm.

    :param domain: marketplace domain, in any form ``_fix_domain`` accepts
    :param url: URL that receives the ``m`` parameter
    :return: the result of ``add_or_replace_parameter(url, 'm', <id>)``
    :raises AmazonUrlCreatorException: if no marker occurs in the fixed domain
    """
    fixed_domain = AmazonUrlCreator._fix_domain(domain)

    # Order matters: markers are substring-tested in sequence and the first
    # hit wins.
    ids_by_marker = (
        ('.com', 'ATVPDKIKX0DER'),
        ('.co.uk', 'A3P5ROKL5A1OLE'),
        ('.fr', 'A1X6FK5RDHNB96'),
        ('.it', 'A11IL2PNWYJU7H'),
        ('.de', 'A3JWKAKR8XB7XF'),
        ('.ca', 'A3DWYIK6Y9EEQB'),
        ('.es', 'A1AT7YVPFBWXBL'),
    )
    for marker, amazon_id in ids_by_marker:
        if marker in fixed_domain:
            break
    else:
        raise AmazonUrlCreatorException('Domain %s not found!' % fixed_domain)

    return add_or_replace_parameter(url, 'm', amazon_id)
def test_build_url_from_asin(self):
    """A bare domain plus an ASIN produce the canonical product URL."""
    built_url = AmazonUrlCreator.build_url_from_asin('amazon.com', '123456')
    self.assertEqual(
        built_url,
        'http://www.amazon.com/gp/product/123456/?ref=twister_dp_update&ie=UTF8&psc=1')