def next_request(self):
    '''
    Logic to handle getting a new url request, from a bunch of
    different queues
    '''
    t = time.time()
    # update the redis queues every so often
    if t - self.update_time > self.update_interval:
        self.update_time = t
        self.create_queues()

    # update the ip address every so often
    if t - self.update_ip_time > self.ip_update_interval:
        self.update_ip_time = t
        self.update_ipaddress()
        self.report_self()

    item = self.find_item()
    if item:
        self.logger.debug("Found url to crawl {url}" \
                .format(url=item['url']))
        try:
            if 'request' in item:
                req = request_from_dict(pickle.loads(item['request']),
                                        self.spider)
            else:
                req = Request(item['url'], meta=make_splash_meta({}))
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request('http://' + item['url'],
                          meta=make_splash_meta({}))

        if 'meta' in item:
            item = item['meta']

        # defaults not in schema
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if "retry_times" not in item:
            item['retry_times'] = 0

        for key in item.keys():
            if key != 'request':
                req.meta[key] = item[key]

        # extra check to add items to request
        if 'useragent' in item and item['useragent'] is not None:
            req.headers['User-Agent'] = item['useragent']
        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            elif isinstance(item['cookie'], basestring):
                req.cookies = self.parse_cookie(item['cookie'])

        return req

    return None
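# Hedged sketch (not in the source): the next_request() variants here call a
# self.parse_cookie() helper that is never shown. Assuming the queue item carries
# cookies as a raw "name=value; name2=value2" header string, the helper would
# plausibly be a small method along these lines:
def parse_cookie(self, string):
    '''Parse a raw Cookie-header-style string into a dict usable as request.cookies.'''
    results = {}
    for pair in string.split(';'):
        if '=' in pair:
            name, value = pair.split('=', 1)
            results[name.strip()] = value.strip()
    return results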
def next_request(self): """ Logic to handle getting a new url request, from a bunch of different queues """ t = time.time() # update the redis queues every so often if t - self.update_time > self.update_interval: self.update_time = t self.create_queues() # update the ip address every so often if t - self.update_ip_time > self.ip_update_interval: self.update_ip_time = t self.update_ipaddress() self.report_self() item = self.find_item() if item: self.logger.debug("Found url to crawl {url}".format(url=item["url"])) try: req = Request(item["url"]) except ValueError: # need absolute url # need better url validation here req = Request("http://" + item["url"]) if "meta" in item: item = item["meta"] # defaults not in schema if "curdepth" not in item: item["curdepth"] = 0 if "retry_times" not in item: item["retry_times"] = 0 for key in item.keys(): req.meta[key] = item[key] # extra check to add items to request if "useragent" in item and item["useragent"] is not None: req.headers["User-Agent"] = item["useragent"] if "cookie" in item and item["cookie"] is not None: if isinstance(item["cookie"], dict): req.cookies = item["cookie"] elif isinstance(item["cookie"], basestring): req.cookies = self.parse_cookie(item["cookie"]) return req return None
def process_request(self, request: Request, spider):
    if spider.name == 'cookiespider':
        cookies = CookieUtils.getCookies()
        logging.info(
            "=================================== handled by the cookie middleware, cookies: %s" % cookies)
        request.cookies = cookies
    return None
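# Hedged sketch (not from the source): for a downloader middleware like the one
# above to run, it would normally be registered in the project's settings.py via
# Scrapy's DOWNLOADER_MIDDLEWARES setting. The module/class path below is a
# placeholder, not the project's real path.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CookieMiddleware': 543,
}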
def make_requests_from_url(self, url, id=None, attr=None):
    #request = Request(url,headers={"Accept-Encoding": "gzip,deflate,sdch","Accept-Language": "en-US,en;q=0.8" , "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.114 Safari/537.36" , "Accept": "*/*" ,"Referer": "https://www.amazon.de" , "Connection": "keep-alive" }, dont_filter=True)
    request = Request(url,
                      headers={'Origin': 'https://www.amazon.de',
                               'Referer': 'https://www.amazon.de',
                               'Accept-Encoding': 'gzip, deflate, br',
                               'Accept-Language': 'en-US,en;q=0.8',
                               'Upgrade-Insecure-Requests': '1',
                               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36, LuminadBot/1.0 ([email protected])',
                               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                               'Cache-Control': 'max-age=0'},
                      dont_filter=True)
    request.cookies = {
        's_pers': '%20s_fid%3D300B8810F7CDBDE1-10092DE00A8359D7%7C1558680220920%3B%20s_dl%3D1%7C1495610020921%3B%20gpv_page%3DDE%253AAZ%253ASOA-Landing%7C1495610020924%3B%20s_ev15%3D%255B%255B%2527AZDEGNOSellC%2527%252C%25271495608209183%2527%255D%252C%255B%2527AZDEGNOSellC%2527%252C%25271495608216403%2527%255D%252C%255B%2527AZDEGNOSellC%2527%252C%25271495608220916%2527%255D%252C%255B%2527AZDEGNOSellC%2527%252C%25271495608220925%2527%255D%255D%7C1653374620925%3B%20s_eVar26%3DAmazon%2520Services%2520DE%7C1498200220927%3B',
        'amznacsleftnav-656eac4a-695b-3a6a-946f-db61e4deb392': '1',
        'amznacsleftnav-fdfd699f-c863-3b78-85b2-8a649c6b58f6': '1',
        'x-amz-captcha-1': '1508482986892769',
        'x-amz-captcha-2': 'hw3RhTh0tvhX81cdMFkFgQ==',
        'lc-acbde': 'de_DE',
        'session-id': '261-5677163-4561642',
        'ubid-acbde': '259-8821950-7904223',
        'a-ogbcbff': '1',
        'x-acbde': '"V0z3CSC5jraR2B7OY6OiPR3wrDO7GbRjA9fTg2AJTorXXbAPToPEDvMAo8KTh@7M"',
        'at-acbde': 'Atza|IwEBIHwqc3CD45BqlJs_5aa-V8dGYqRemzUHaOJhdARXf-o6rlAp0DANlQO8ZPGB23Uek573IjBb2qkX4mlZWKna1Xn3pOzTpiUd0SQO7gh-uTZnxF5r2p22mMsR4_clEZvBBlZBMJYXD6HPxW7_sEYtklqCkY-Br197rDnz9KPza3y5u7XzgezJIBdXCaeq4vAqo9Wrl0uG0RGKSr41-4rKK9hpnGK1nN4UbO_qWxnLSwzA6LwgXczqe0C5EyH1HIp12IlKFB7OgxIEsH0QZAiT0eh0D7sFwlVG6eHfqPNWfix03SZ7apAC7C7jQ-vw1lmICAeJciD9QmumuCNEDDCT-GGWCkrAh-gxMRhKpm7Q5_gOtJijbqoLi3VfPO9QrCA7hYW8Atc-kFRIW3Y6vtRc8OZzZipCneewy-Rj_xYUMFVWMCmHs_ljfe2W6vxWgiRfmyw',
        'sess-at-acbde': '"NbwPRqfG4oPuznYLUmFM5Y5JSvyizaA9ZJz6vTkNQL4="',
        'sst-acbde': 'Sst1|PQEs5smXCO43G8WIotdsANHyCEBZ9TkcZ_OdLYTgnk2mCfAy4Z5W77Y7zX74BQuxS7UKtfnUM6KkKhmcu01A2Fq7xshyjesDvnQDYp9QYcrFDvlceaVvpWqQfpEt2Q9XIM0VQFdd2EMpXc4C9QlehgHT0URfOlUmC47BkfeJr5dpb4Pv_dbnFASQli0k7Cln9sN_Vf4Wqz4km-6UTpsNlVJxJE48_RK6Zsk7bklH_cpJE8tfltiPzdhyhY2oDh7SieUx6CNKphxtIezjzr-0SbD8cg',
        'x-wl-uid': '11PAl+O2T6FeY67SmgtWeMBtyZ538YMsy2Zcpov67B4kL2DVIv3Nx7rEprTLBkI4W3ZZ954YAADFuG1oAMSt9uIgNhk3yQfBCY6pDMJUcXUzK6rFTPF4tPnrWr3utKPzHqJATwvQOHKE=',
        'session-token': '"tzfdQwuhV4SLJ9/PfV3QSfg2b3LxOcRlqovsFb3AsrqZSnkxHCjhgMsO3d7NbIS7rOee9CPoh7Lxo8LF7EdVopNDFYLMzzOtDGVhnY4czMEVNS5VHAxjtdaDvRNDJC0OloD0EvRMDfHeXG70D93/wWVNfqU0c6nKEv0yTLU7pFpIbTicUYQQFeDZYf9tPQEepQxbZ1pBOU+0FjTwWUj3SnNdDf/SVmmk+feDLRuqn+WcP6w6CPQ1G03W/TACUuIHBz9mSMRFPU0il4m+s0KyzA=="',
        'csm-hit': 's-F8Q4HD9WHE8M6GMQKQT4|1519186540551',
        'session-id-time': '2082754801l',
    }
    #request.cookies ={ 'x-wl-uid':'1yOwLjX2WnY9mLM7WsqYh6e6V1fXMd1ZMNtSL2K4PXEdSmASj6jCPPBezf56CZBu8dNd+B0dbGk6FSb6sv3/5Z2bObc/d7RBUn4jelvgzhpzxeiQQPCByKtKt+rFfaF6lordo7OBLv6I=', 's_vn':'1538041742354%26vn%3D1', 's_fid':'7FA70D7094115718-2F7725F9CDA62241', 'regStatus':'pre-register', 's_nr':'1506673939908-Repeat', 's_vnum':'1938673939908%26vn%3D1', 's_dslv':'1506673939908', 'JSESSIONID':'7D8C49FEC5F5D74FBFB8C44B4582E920', 'skin':'noskin', 'session-token':'fMF7GsLbD9OFUtBEffIAbQYQ+k+oGY4qtqc4L+jpdCrQuiLu4c9Hm8YSsbtiO5c9mfQ3IRuuQojX/N/SOZ1vcQVF58RRX0RpMeXLEPvV50aTQq+f/s/rV8yGoETGydD/29yEVxxEqc4cWCblz5+V28+sOHeSSoUiYwysN7+jUIC+ICgHh8EJAM1aQiONRz31', 'ubid-main':'131-1502033-8002851', 'session-id-time':'2082787201l', 'session-id':'143-4281452-3926723', 'csm-hit':'%7B%22tb%22%3A%223FYTGMTG10SZNP3AYFTN%2Bs-TWA04Y4WMDA93A0N8PZQ%7C1507802966608%22%7D' }
    if id:
        request.meta['id'] = id
    if attr:
        request.meta['attr'] = attr
    # set the meta['item'] to use the item in the next call back
    return request
def request_from_feed(self, item):
    try:
        req = Request(item['url'])
    except ValueError:
        # need absolute url
        # need better url validation here
        req = Request('http://' + item['url'])

    # defaults not in schema
    if 'curdepth' not in item:
        item['curdepth'] = 0
    if "retry_times" not in item:
        item['retry_times'] = 0

    for key in list(item.keys()):
        req.meta[key] = item[key]

    # extra check to add items to request
    if 'cookie' in item and item['cookie'] is not None:
        if isinstance(item['cookie'], dict):
            req.cookies = item['cookie']
        elif isinstance(item['cookie'], string_types):
            req.cookies = self.parse_cookie(item['cookie'])

    return req
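# Hedged usage sketch (not from the source): request_from_feed() above expects a
# dict-like feed item; 'url' and 'cookie' are the keys it actually inspects, the
# rest are copied verbatim into request.meta. The instance name 'scheduler' and
# all concrete values below are made up for illustration.
feed_item = {
    'url': 'http://example.com/page',
    'crawlid': 'abc123',
    'cookie': 'session-id=123-4567890; lc-main=en_US',
}
req = scheduler.request_from_feed(feed_item)
# req.meta now carries every feed key (plus the curdepth/retry_times defaults),
# and req.cookies holds the dict produced by parse_cookie()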
def next_request(self):
    '''
    Logic to handle getting a new url request, from a bunch of
    different queues
    '''
    t = time.time()
    # update the redis queues every so often
    if t - self.update_time > self.update_interval:
        self.update_time = t
        self.create_queues()
        self.expire_queues()

    # update the ip address every so often
    if t - self.update_ip_time > self.ip_update_interval:
        self.update_ip_time = t
        self.update_ipaddress()
        self.report_self()

    item = self.find_item()
    if item:
        self.logger.debug("Found url to crawl {url}" \
                .format(url=item['url']))
        try:
            req = Request(item['url'])
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request('http://' + item['url'])

        try:
            if 'callback' in item and item['callback'] is not None:
                req.callback = getattr(self.spider, item['callback'])
        except AttributeError:
            self.logger.warn("Unable to find callback method")

        try:
            if 'errback' in item and item['errback'] is not None:
                req.errback = getattr(self.spider, item['errback'])
        except AttributeError:
            self.logger.warn("Unable to find errback method")

        if 'meta' in item:
            item = item['meta']

        # defaults not in schema
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if "retry_times" not in item:
            item['retry_times'] = 0

        for key in list(item.keys()):
            req.meta[key] = item[key]

        # extra check to add items to request
        if 'useragent' in item and item['useragent'] is not None:
            req.headers['User-Agent'] = item['useragent']
        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            elif isinstance(item['cookie'], basestring):
                req.cookies = self.parse_cookie(item['cookie'])

        return req

    return None
def next_request(self):
    '''
    Logic to handle getting a new url request, from a bunch of
    different queues
    '''
    t = time.time()
    # update the redis queues every so often
    if t - self.update_time > self.update_interval:
        self.update_time = t
        self.create_queues()

    item = self.find_item()
    if item:
        self.logger.info(
            'distributed_scheduler.py::DistributedScheduler::next_request call find_item() result is : %s'
            % (item["meta"]["url"] if 'meta' in item else item["url"]))
        self.logger.debug("Found url to crawl {url}" \
                .format(url=item['url']))
        try:
            req = Request(item['url'])
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request('http://' + item['url'])

        if 'callback' in item:
            cb = item['callback']
            if cb and self.spider:
                cb = get_method(self.spider, cb)
                req.callback = cb

        if 'errback' in item:
            eb = item['errback']
            if eb and self.spider:
                eb = get_method(self.spider, eb)
                req.errback = eb

        if 'meta' in item:
            item = item['meta']

        # defaults not in schema
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if "retry_times" not in item:
            item['retry_times'] = 0

        for key in item.keys():
            req.meta[key] = item[key]

        # extra check to add items to request
        if 'useragent' in item and item['useragent'] is not None:
            req.headers['User-Agent'] = item['useragent']
        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            elif isinstance(item['cookie'], basestring):
                req.cookies = self.parse_cookie(item['cookie'])

        return req

    return None
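# Hedged sketch (an assumption, not shown in the source): the variants above
# resolve 'callback'/'errback' name strings either with getattr() directly or
# through a get_method() helper. A minimal get_method() consistent with that
# usage could look like this:
def get_method(obj, name):
    '''Return the bound method named `name` on `obj`, or None if it does not exist.'''
    try:
        return getattr(obj, str(name))
    except AttributeError:
        return None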
def parseURL(self, response):

    site = response.meta['origin_site']
    hxs = HtmlXPathSelector(response)

    product_model = ""
    product_brand = ""
    product_price = ""

    #############################################################
    # Extract product attributes (differently depending on site)

    if site == 'staples':

        product_name = hxs.select("//h1/text()").extract()[0]

        model_nodes = hxs.select("//p[@class='itemModel']/text()").extract()
        if model_nodes:
            model_node = model_nodes[0]

            model_node = re.sub("\W", " ", model_node, re.UNICODE)
            m = re.match("(.*)Model:(.*)", model_node.encode("utf-8"), re.UNICODE)

            if m:
                product_model = m.group(2).strip()

    elif site == 'walmart':
        product_name_holder = hxs.select("//h1[@class='productTitle']/text()").extract()
        if product_name_holder:
            product_name = product_name_holder[0].strip()

            # get integer part of product price
            product_price_big = hxs.select("//span[@class='bigPriceText1']/text()").extract()
            if not product_price_big:
                self.log("Didn't find product price: " + response.url + "\n", level=log.DEBUG)

            # if there is a range of prices take their average
            if len(product_price_big) > 1:
                # remove $ and .
                product_price_min = re.sub("[\$\.,]", "", product_price_big[0])
                product_price_max = re.sub("[\$\.,]", "", product_price_big[-1])

                #TODO: check if they're ints?
                product_price_big = (int(product_price_min) + int(product_price_max)) / 2.0

            elif product_price_big:
                product_price_big = int(re.sub("[\$\.,]", "", product_price_big[0]))

            # get fractional part of price
            #TODO - not that important

            if product_price_big:
                product_price = product_price_big

        else:
            sys.stderr.write("Broken product page link (can't find item title): " + response.url + "\n")

            # return the item as a non-matched item
            item = SearchItem()
            #item['origin_site'] = site
            item['origin_url'] = response.url
            # remove unnecessary parameters
            m = re.match("(.*)\?enlargedSearch.*", item['origin_url'])
            if m:
                item['origin_url'] = m.group(1)
            #item['origin_id'] = self.extract_walmart_id(item['origin_url'])

            if self.name != 'manufacturer':
                # don't return empty matches in manufacturer spider
                yield item
            return

        #TODO: if it contains 2 words, first could be brand - also add it in similar_names function
        product_model_holder = hxs.select("//td[contains(text(),'Model')]/following-sibling::*/text()").extract()
        if product_model_holder:
            product_model = product_model_holder[0]

    #TODO: for the sites below, complete with missing logic, for not returning empty elements in manufacturer spider

    elif site == 'newegg':
        product_name_holder = hxs.select("//span[@itemprop='name']/text()").extract()
        if product_name_holder:
            product_name = product_name_holder[0].strip()
        else:
            sys.stderr.write("Broken product page link (can't find item title): " + response.url + "\n")
            item = SearchItem()
            #item['origin_site'] = site
            item['origin_url'] = response.url
            yield item
            return

        product_model_holder = hxs.select("//dt[text()='Model']/following-sibling::*/text()").extract()
        if product_model_holder:
            product_model = product_model_holder[0]

    else:
        raise CloseSpider("Unsupported site: " + site)

    if site == 'staples':
        zipcode = "12345"
        cookies = {"zipcode": zipcode}
    else:
        cookies = {}

    #######################################################################
    # Create search queries to the second site, based on product attributes

    request = None

    #TODO: search by alternative model numbers?
    #TODO: search by model number extracted from product name? Don't I do that implicitly? no, but in combinations

    # if there is no product model, try to extract it
    if not product_model:
        product_model = ProcessText.extract_model_from_name(product_name)
        # for logging purposes, set this back to the empty string if it wasn't found (so was None)
        if not product_model:
            product_model = ""

    # product_model_index = ProcessText.extract_model_nr_index(product_name)
    # if product_model_index >= 0:
    #     product_model = product_name[product_model_index]
    ## print "MODEL EXTRACTED: ", product_model, " FROM NAME ", product_name

    # if there is no product brand, get first word in name, assume it's the brand
    product_brand_extracted = ""
    #product_name_tokenized = ProcessText.normalize(product_name)
    product_name_tokenized = [word.lower() for word in product_name.split(" ")]
    #TODO: maybe extract brand as word after 'by', if 'by' is somewhere in the product name
    if len(product_name_tokenized) > 0 and re.match("[a-z]*", product_name_tokenized[0]):
        product_brand_extracted = product_name_tokenized[0].lower()

    # if we are in manufacturer spider, set target_site to manufacturer site
    # for manufacturer spider set target_site of request to brand extracted from name for this particular product
    if self.name == 'manufacturer':

        #TODO: restore commented code; if brand not found, try to search for it on every manufacturer site (build queries for every supported site)

        # hardcode target site to sony
        #self.target_site = 'sony'
        #self.target_site = product_brand_extracted
        #target_site = product_brand_extracted

        # can only go on if site is supported
        # (use dummy query)
        #if target_site not in self.build_search_pages("").keys():
        if product_brand_extracted not in self.build_search_pages("").keys():

            product_brands_extracted = set(self.build_search_pages("").keys()).intersection(set(product_name_tokenized))

            if product_brands_extracted:
                product_brand_extracted = product_brands_extracted.pop()
                #target_site = product_brand_extracted
            else:
                # give up and return item without match
                self.log("Manufacturer site not supported (" + product_brand_extracted +
                         ") or not able to extract brand from product name (" + product_name + ")\n",
                         level=log.ERROR)

                ## comment lines below to: don't return anything if you can't search on manufacturer site
                # item = SearchItem()
                # item['origin_url'] = response.url
                # item['origin_name'] = product_name
                # if product_model:
                #     item['origin_model'] = product_model
                # yield item
                return

        # if specific site is not set, search on manufacturer site as extracted from name
        if not self.manufacturer_site:
            target_site = product_brand_extracted
        else:
            # if it's set, continue only if it matches extracted brand
            if self.manufacturer_site != product_brand_extracted:
                self.log("Will abort matching for product, extracted brand does not match specified manufacturer option (" +
                         product_brand_extracted + ")\n", level=log.INFO)

                ## comment lines below to: don't return anything if you can't search on manufacturer site
                # item = SearchItem()
                # item['origin_url'] = response.url
                # item['origin_name'] = product_name
                # if product_model:
                #     item['origin_model'] = product_model
                # yield item
                return
            else:
                target_site = product_brand_extracted

        # # try to match it without specific site (manufacturer spider will try to search on all manufacturer sites)
        # target_site = None

    # for other (site specific) spiders, set target_site of request to class variable self.target_site set in class "constructor" (init_sub)
    else:
        target_site = self.target_site

    # 1) Search by model number
    if product_model:

        #TODO: model was extracted with ProcessText.extract_model_from_name(), without lowercasing, should I lowercase before adding it to query?
        query1 = self.build_search_query(product_model)
        search_pages1 = self.build_search_pages(query1)
        #page1 = search_pages1[self.target_site]
        page1 = search_pages1[target_site]
        request1 = Request(page1, callback=self.parseResults)

        # set amazon cookies
        if (self.target_site == 'amazon' and self.cookies_file):
            request1.cookies = self.amazon_cookies
            request1.headers['Cookies'] = self.amazon_cookie_header
            #request1.meta['dont_merge_cookies'] = True
            ## print "SET AMAZON COOKIES"

        request1.meta['query'] = query1
        request1.meta['target_site'] = target_site

        request = request1

    # 2) Search by product full name
    query2 = self.build_search_query(product_name)
    search_pages2 = self.build_search_pages(query2)
    #page2 = search_pages2[self.target_site]
    page2 = search_pages2[target_site]
    request2 = Request(page2, callback=self.parseResults)

    # set cookies for amazon
    if (self.target_site == 'amazon' and self.cookies_file):
        request2.cookies = self.amazon_cookies
        request2.headers['Cookies'] = self.amazon_cookie_header
        #request2.meta['dont_merge_cookies'] = True

    request2.meta['query'] = query2
    request2.meta['target_site'] = target_site

    pending_requests = []

    if not request:
        request = request2
    else:
        pending_requests.append(request2)

    # 3) Search by combinations of words in product's name
    # create queries
    for words in ProcessText.words_combinations(product_name, fast=self.fast):
        query3 = self.build_search_query(" ".join(words))
        search_pages3 = self.build_search_pages(query3)
        #page3 = search_pages3[self.target_site]
        page3 = search_pages3[target_site]
        request3 = Request(page3, callback=self.parseResults)

        # set amazon cookies
        if (self.target_site == 'amazon' and self.cookies_file):
            request3.cookies = self.amazon_cookies
            request3.headers['Cookies'] = self.amazon_cookie_header
            #request3.meta['dont_merge_cookies'] = True

        request3.meta['query'] = query3
        request3.meta['target_site'] = target_site

        pending_requests.append(request3)

    request.meta['pending_requests'] = pending_requests
    #request.meta['origin_site'] =
    # product page from source site
    #TODO: clean this URL? for walmart it added something with ?enlargedsearch=True
    request.meta['origin_url'] = response.url
    request.meta['origin_name'] = product_name
    request.meta['origin_model'] = product_model
    if product_price:
        request.meta['origin_price'] = product_price
    # origin product brand as extracted from name (basically the first word in the name)
    request.meta['origin_brand_extracted'] = product_brand_extracted

    # if self.by_id:
    #     request.meta['origin_id'] = self.extract_walmart_id(response.url)

    #self.target_site = product_brand_extracted #TODO: should this be here?? target_site = product_brand_extracted

    # print "SENDING REQUEST FOR ", product_name, response.url

    yield request
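# Hedged sketch (an assumption): the spider snippets reference self.amazon_cookies
# and self.amazon_cookie_header, populated when self.cookies_file is set, but the
# loader itself is never shown. One plausible shape, assuming the file stores one
# "name=value" pair per line:
def load_amazon_cookies(self):
    cookies = {}
    with open(self.cookies_file) as f:
        for line in f:
            line = line.strip()
            if line and '=' in line:
                name, value = line.split('=', 1)
                cookies[name] = value
    self.amazon_cookies = cookies
    # the same pairs serialized as a single header value
    self.amazon_cookie_header = "; ".join("%s=%s" % (n, v) for n, v in cookies.items())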
def parse(self, response):

    if self.product_name:

        # can only use this option if self.target_site has been initialized (usually true for spiders for retailers sites, not true for manufacturer's sites)
        if not self.target_site:
            self.log("You can't use the product_name option without setting the target site to search on\n",
                     level=log.ERROR)
            raise CloseSpider("\nYou can't use the product_name option without setting the target site to search on\n")

        search_query = self.build_search_query(self.product_name)
        search_pages = self.build_search_pages(search_query)

        request = Request(search_pages[self.target_site], callback=self.parseResults)

        # set amazon cookies
        if (self.target_site == 'amazon' and self.cookies_file):
            request.cookies = self.amazon_cookies
            request.headers['Cookies'] = self.amazon_cookie_header
            #request.meta['dont_merge_cookies'] = True
            ## print "SET AMAZON COOKIES"

        request.meta['origin_name'] = self.product_name
        request.meta['query'] = search_query

        # just use empty product model and url, for compatibility, also pending_requests
        request.meta['origin_model'] = ''
        request.meta['origin_url'] = ''
        request.meta['pending_requests'] = []

        yield request

    # if we have product URLs, pass them to parseURL to extract product names (which will pass them to parseResults)
    product_urls = []

    # if we have a single product URL, create a list of URLs containing it
    if self.product_url:
        product_urls.append(self.product_url)

    # if we have a file with a list of URLs, create a list with URLs found there
    if self.product_urls_file:
        f = open(self.product_urls_file, "r")
        for line in f:
            product_urls.append(line.strip())
        f.close()

    for product_url in product_urls:
        # extract site domain

        # m = re.match("http://www1?\.([^\.]+)\.com.*", product_url)
        # origin_site = ""
        # if m:
        #     origin_site = m.group(1)
        # else:
        #     sys.stderr.write('Can\'t extract domain from URL.\n')

        origin_site = Utils.extract_domain(product_url)

        request = Request(product_url, callback=self.parseURL)
        request.meta['origin_site'] = origin_site
        if origin_site == 'staples':
            zipcode = "12345"
            request.cookies = {"zipcode": zipcode}
            request.meta['dont_redirect'] = True
        yield request

    # if we have a file with Walmart ids, create a list of the ids there
    if self.walmart_ids_file:
        walmart_ids = []
        f = open(self.walmart_ids_file, "r")
        for line in f:
            if "," in line:
                id_string = line.strip().split(",")[0]
            else:
                id_string = line.strip()
            if re.match("[0-9]+", id_string):
                walmart_ids.append(id_string)
        f.close()

        self.by_id = True

        for walmart_id in walmart_ids:
            # create Walmart URLs based on these IDs
            walmart_url = Utils.add_domain(walmart_id, "http://www.walmart.com/ip/")
            request = Request(walmart_url, callback=self.parseURL)
            #request.meta['origin_site'] = 'walmart'
            yield request
def parseResults(self, response):

    hxs = HtmlXPathSelector(response)

    # print "PARSE AMAZON FOR", response.meta['origin_url'], "RESULTS PAGE", response.url

    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    # add product URLs to be parsed to this list
    if 'search_results' not in response.meta:
        product_urls = set()
    else:
        product_urls = response.meta['search_results']

    # get search results for received results page and add them to product_urls to be parsed
    results = hxs.select("//h3[@class='newaps']/a")
    for result in results:
        product_url = result.select("@href").extract()[0]

        # remove the part after "/ref" containing details about the search query
        m = re.match("(.*)/ref=(.*)", product_url)
        if m:
            product_url = m.group(1)

        product_url = Utils.add_domain(product_url, "http://www.amazon.com")
        product_urls.add(product_url)

    # extract product info from product pages (send request to parse first URL in list)
    # add as meta all that was received as meta, will pass it on to reduceResults function in the end
    # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed

    # send the request further to parse product pages only if we gathered all the product URLs from all the queries
    # (there are no more pending requests)
    # otherwise send them back to parseResults and wait for the next query, save all product URLs in search_results
    # this way we avoid duplicates
    if product_urls and ('pending_requests' not in response.meta or not response.meta['pending_requests']):
        request = Request(product_urls.pop(), callback=self.parse_product_amazon, meta=response.meta)

        if self.cookies_file:
            request.cookies = self.amazon_cookies
            request.headers['Cookies'] = self.amazon_cookie_header
            #request.meta['dont_merge_cookies'] = True

        request.meta['items'] = items
        # this will be the new product_urls list with the first item popped
        request.meta['search_results'] = product_urls

        return request

    # if there were no results, the request will never get back to reduceResults
    # so send it from here so it can parse the next queries
    # add to the response the URLs of the products to crawl we have so far, items (handles case when it was not created yet)
    # and field 'parsed' to indicate that the call was received from this method (was not the initial one)
    else:
        response.meta['items'] = items
        response.meta['parsed'] = True
        response.meta['search_results'] = product_urls
        # only send the response we have as an argument, no need to make a new request
        # print "RETURNING TO REDUCE RESULTS", response.meta['origin_url']
        return self.reduceResults(response)
def parse_product_amazon(self, response):

    # print "PARSE AMAZON PRODUCT FOR", response.meta['origin_url'], response.url

    hxs = HtmlXPathSelector(response)
    items = response.meta['items']

    #site = response.meta['origin_site']
    origin_url = response.meta['origin_url']

    item = SearchItem()
    item['product_url'] = response.url
    #item['origin_site'] = site
    item['origin_url'] = origin_url
    item['origin_name'] = response.meta['origin_name']
    if 'origin_model' in response.meta:
        item['origin_model'] = response.meta['origin_model']

    # if 'origin_id' in response.meta:
    #     item['origin_id'] = response.meta['origin_id']
    #     assert self.by_id
    # else:
    #     assert not self.by_id

    # extract product name
    #TODO: id='title' doesn't work for all, should I use a 'contains' or something?
    # extract titles that are not empty (ignoring whitespace)
    # eliminate "Amazon Prime Free Trial"
    #TODO: to test this
    #product_name = filter(lambda x: not x.startswith("Amazon Prime"), hxs.select("//div[@id='title_feature_div']//h1//text()[normalize-space()!='']").extract())
    product_name = filter(lambda x: not x.startswith("Amazon Prime"),
                          hxs.select("//h1//text()[normalize-space()!='']").extract())
    if not product_name:
        # print "NO PRODUCT NAME FOR", response.url
        self.log("Error: No product name: " + str(response.url) + " for walmart product " + origin_url,
                 level=log.ERROR)

        # assume there is a captcha to crack
        # check if there is a form on the page - that means it's probably the captcha form
        forms = hxs.select("//form")
        if forms:
            # solve captcha
            captcha_text = None
            image = hxs.select(".//img/@src").extract()
            if image:
                captcha_text = self.CB.solve_captcha(image[0])

            # value to use if there was an exception
            if not captcha_text:
                captcha_text = ''

            # create a FormRequest to this same URL, with everything needed in meta
            # items, cookies and search_urls not changed from previous response so no need to set them again
            # redo the entire request (no items will be lost)
            return [FormRequest.from_response(response,
                                              callback=self.parse_product_amazon,
                                              formdata={'field-keywords': captcha_text},
                                              meta=response.meta)]
    else:
        item['product_name'] = product_name[0].strip()

    # extract product model number
    model_number_holder = hxs.select(
        "//tr[@class='item-model-number']/td[@class='value']/text() | //li/b/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/text()").extract()
    if model_number_holder:
        item['product_model'] = model_number_holder[0].strip()
    # if no product model explicitly on the page, try to extract it from name
    else:
        product_model_extracted = ProcessText.extract_model_from_name(item['product_name'])
        if product_model_extracted:
            item['product_model'] = product_model_extracted
            ## print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

    brand_holder = hxs.select("//div[@id='brandByline_feature_div']//a/text() | //a[@id='brand']/text()").extract()
    if brand_holder:
        item['product_brand'] = brand_holder[0]
    else:
        pass
        #sys.stderr.write("Didn't find product brand: " + response.url + "\n")

    # extract price
    #! extracting list price and not discount price when discounts available?
    price_holder = hxs.select("//span[contains(@id,'priceblock')]/text() | //span[@class='a-color-price']/text() " + \
        "| //span[@class='listprice']/text() | //span[@id='actualPriceValue']/text() | //b[@class='priceLarge']/text() | //span[@class='price']/text()").extract()

    # if we can't find it like above try other things:
    if not price_holder:
        # prefer new prices to used ones
        price_holder = hxs.select("//span[contains(@class, 'olp-new')]//text()[contains(.,'$')]").extract()

    if price_holder:
        product_target_price = price_holder[0].strip()
        # remove commas separating orders of magnitude (ex 2,000)
        product_target_price = re.sub(",", "", product_target_price)
        m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
        if m:
            item['product_target_price'] = float(m.group(1))
        else:
            self.log("Didn't match product price: " + product_target_price + " " + response.url + "\n",
                     level=log.WARNING)
    else:
        self.log("Didn't find product price: " + response.url + "\n", level=log.INFO)

    # add result to items
    items.add(item)

    # print "STILL IN parse_product FOR", response.url

    product_urls = response.meta['search_results']

    # try to send request to parse next product, try until url for next product url is valid (response not 404)
    # this is needed because if next product url is not valid, this request will not be sent
    # and all info about this match (stored in request meta) will be lost

    # find first valid next product url
    next_product_url = None
    if product_urls:
        next_product_url = product_urls.pop()
        while (product_urls and not self.is_valid_url(next_product_url)):
            # print "404 FROM", next_product_url
            next_product_url = product_urls.pop()

        # handle corner case of bad next product url
        if not product_urls and next_product_url and not self.is_valid_url(next_product_url):
            next_product_url = None

    # if a next product url was found, send new request back to parse_product_url
    if next_product_url:
        request = Request(next_product_url, callback=self.parse_product_amazon, meta=response.meta)

        if self.cookies_file:
            request.cookies = self.amazon_cookies
            request.headers['Cookies'] = self.amazon_cookie_header
            #request.meta['dont_merge_cookies'] = True

        request.meta['items'] = items
        # eliminate next product from pending list (this will be the new list with the first item popped)
        request.meta['search_results'] = product_urls

        # print "RETURNING FROM PARSE AMAZON PRODUCT TO parse_product FOR", response.meta['origin_url'], response.url, "NEXT IS", next_product_url

        respcode = urllib.urlopen(next_product_url)

        return request

    # if no next valid product url was found
    else:
        # we are done, send the response back to reduceResults (no need to make a new request)
        # add as meta newly added items
        # also add 'parsed' field to indicate that the parsing of all products was completed and they can be further used
        # (actually that the call was made from this method and was not the initial one, so it has to move on to the next request)
        response.meta['parsed'] = True
        response.meta['items'] = items

        # print "RETURNING FROM PARSE AMAZON PRODUCT TO reduce_results FOR", response.meta['origin_url'], response.url

        return self.reduceResults(response)