def parseResults(self, response):
    hxs = HtmlXPathSelector(response)

    #site = response.meta['origin_site']
    origin_name = response.meta['origin_name']
    origin_model = response.meta['origin_model']

    # if this comes from a previous request, get last request's items and add to them the results
    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    results = hxs.select("//div[@class='shortDescription']/a")
    for result in results:
        item = SearchItem()
        #item['origin_site'] = site
        item['product_name'] = result.select("text()").extract()[0]
        item['product_url'] = result.select("@href").extract()[0]

        if 'origin_url' in response.meta:
            item['origin_url'] = response.meta['origin_url']

        items.add(item)

    response.meta['items'] = items
    response.meta['parsed'] = True
    return self.reduceResults(response)
def parse(self, response):
    sel = Selector(response)
    items = []
    for i in range((self.pages - 1) * 10, self.pages * 10):
        i += 1
        item = SearchItem()
        url = sel.xpath('//div[@id="%d"]/h3/a/@href' % (i)).extract()
        title = sel.xpath('//div[@id="%d"]/h3/a//text()' % (i)).extract()
        desc = sel.xpath('//div[@id="%d"]//div[@class="c-abstract"]//text()' % (i)).extract()
        item['num'] = str(i)
        item['title'] = title if title else '未爬取到标题'  # fallback: "title not crawled"
        item['url'] = url if url else 'www.example.com'
        item['desc'] = desc if desc else '未爬取到内容'  # fallback: "content not crawled"
        print str(i) + ' ' + ''.join(item['title']) + '\n'
        yield item

    next_page = sel.xpath('//div[@id="page"]/a[@class="n"]/@href').extract()
    self.pages += 1
    if len(next_page) == 2:
        yield Request(self.domain + next_page[1])
    elif ((len(next_page) == 1) and (re.match(r'.*rsv_page=1$', next_page[0]))):
        yield Request(self.domain + next_page[0])
    else:
        print "Congratulations! All results have been crawled!"
        raise CloseSpider('Happy Ending')
def parse(self, response):
    self.count += 1
    print("\nPage %d, URL %s" % (self.count, response.url))

    # Check whether this page's text contains the keyword.
    ls = response.xpath('//text()').extract()
    mark = 0
    for s in ls:
        if 'кита' in s.lower():
            mark = 1
            break

    # mark == 1 means the keyword appears on this page.
    if mark == 1:
        item = SearchItem()
        item['site'] = response.url
        yield item

    # Once enough pages have been crawled, stop collecting new links
    # and only follow the lower-level URLs already queued.
    if self.count < 3:
        next_pages = response.xpath('//a/@href').extract()
        if len(next_pages) > 0:
            for link in next_pages:
                # Links starting with http are absolute (off-site) links; use them as-is.
                if re.match('http', link):
                    yield scrapy.Request(link, callback=self.parse)
                # Otherwise it's an on-site link; prepend the site prefix.
                else:
                    yield scrapy.Request('https://yandex.ru' + link, callback=self.parse)
def extract_result_products(self, response):
    hxs = HtmlXPathSelector(response)
    results = hxs.select("//div[@class='innerWrapper']")
    items = []

    for result in results:
        item = SearchItem()
        product_name = result.select(".//div[@class='shortDescription']/a/text()").extract()
        product_url = result.select(".//div[@class='shortDescription']/a/@href").extract()

        # skip this result if there is no product name or URL
        if product_name and product_url:
            item['product_url'] = "http://www1.macys.com" + product_url[0]
            item['product_name'] = product_name[0].strip()
        else:
            self.log("No product name: " + str(response.url) + " from product: " +
                     response.meta['origin_url'], level=log.ERROR)
            continue

        # extract price
        #! extracting regular price and not discount price when discounts available?
        price_holder = result.select("div[@class='prices']/span/text()").extract()
        if price_holder:
            product_target_price = price_holder[0].strip()
            # remove commas separating orders of magnitude (ex 2,000)
            product_target_price = re.sub(",", "", product_target_price)
            # if more than one match, it will get the first one
            m = re.match("([a-zA-Z\.\s]+)?(\xa3|\$)([0-9]+\.?[0-9]*)", product_target_price)
            if m:
                price = float(m.group(3))
                currency = m.group(2)
                item['product_target_price'] = Utils.convert_to_dollars(price, currency)
            else:
                self.log("Didn't match product price: " + product_target_price + " " +
                         response.url + "\n", level=log.WARNING)
        else:
            self.log("Didn't find product price: " + response.url + "\n", level=log.DEBUG)

        # extract product brand
        #

        items.append(item)

    return items
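# A minimal sketch (an assumption, not from the original source): Utils.convert_to_dollars()
# is called above but defined elsewhere. Judging by its call sites, it takes a numeric price
# and a currency symbol and returns the price in USD; the rates below are illustrative
# placeholders only.
def convert_to_dollars(price, currency):
    # hypothetical fixed conversion rates keyed by currency symbol (u'\xa3' is the pound sign)
    rates = {u'$': 1.0, u'\xa3': 1.6}
    return price * rates.get(currency, 1.0)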
def start_requests(self):
    # pop keywords until the set is exhausted
    # (a bare "while True" would raise KeyError on the last pop)
    while self.kw:
        code = self.kw.pop()
        code = code.strip('\n')
        item = SearchItem()
        item['keyword'] = code
        sic_url = 'http://www.alexa.com/siteinfo/%s' % code
        yield Request(url=sic_url,
                      meta={"item": item['keyword']},
                      callback=self.parse0)
def parse_product_sony(self, response):
    hxs = HtmlXPathSelector(response)

    items = response.meta['items']
    #site = response.meta['origin_site']
    origin_url = response.meta['origin_url']

    # create item
    item = SearchItem()
    item['product_url'] = response.url
    item['origin_url'] = origin_url
    # hardcode brand to sony
    item['product_brand'] = 'sony'

    # extract product name, brand, model, etc; add to items
    product_name = hxs.select("//h2[@class='ws-product-title fn']//text()")
    if not product_name:
        self.log("Error: No product name: " + str(response.url), level=log.INFO)
    else:
        item['product_name'] = product_name.extract()[0]

    product_model = hxs.select("//span[@class='ws-product-item-number-value item-number']/text()")
    if product_model:
        item['product_model'] = product_model.extract()[0]

    item['product_images'] = len(hxs.select("//a[@class='ws-alternate-views-list-link']/img").extract())
    item['product_videos'] = len(hxs.select("//li[@class='ws-video']//img").extract())

    items.add(item)

    # if there are any more results to be parsed, send a request back to this method with the next product to be parsed
    product_urls = response.meta['search_results']
    if product_urls:
        request = Request(product_urls.pop(), callback=self.parse_product_sony, meta=response.meta)
        request.meta['items'] = items
        # eliminate next product from pending list (this will be the new list with the first item popped)
        request.meta['search_results'] = product_urls
        return request
    else:
        # otherwise, we are done; send the response back to reduceResults (no need to make a new request)
        response.meta['parsed'] = True
        response.meta['items'] = items
        return self.reduceResults(response)
def parse_product_maplin(self, response):
    hxs = HtmlXPathSelector(response)

    items = response.meta['items']
    #site = response.meta['origin_site']
    origin_url = response.meta['origin_url']

    item = SearchItem()
    item['product_url'] = response.url
    #item['origin_site'] = site
    item['origin_url'] = origin_url
    item['origin_name'] = response.meta['origin_name']
    if 'origin_model' in response.meta:
        item['origin_model'] = response.meta['origin_model']
    if 'origin_upc' in response.meta:
        item['origin_upc'] = response.meta['origin_upc']
    if 'origin_brand' in response.meta:
        item['origin_brand'] = response.meta['origin_brand']

    product_name_node = hxs.select("//h1[@itemprop='name']/text()").extract()
    if product_name_node:
        product_name = product_name_node[0].strip()
    else:
        self.log("Error: No product name: " + str(response.url) +
                 " for source product " + origin_url, level=log.ERROR)
        # TODO: is this ok? I think so
        return

    item['product_name'] = product_name

    # extract product model number
    # TODO: no model?
    # TODO: no upc?
    # TODO: no brand?
    # TODO: add code extraction

    # extract price
    price_holder = hxs.select("//meta[@itemprop='price']/@content").extract()
    # if we can't find it like above, try other things:
    if price_holder:
        product_target_price = price_holder[0].strip()
        # remove commas separating orders of magnitude (ex 2,000)
        product_target_price = re.sub(",", "", product_target_price)
        try:
            product_target_price = float(product_target_price)
            # convert to dollars (assume pounds)
            product_target_price = Utils.convert_to_dollars(product_target_price, u'\xa3')
            item['product_target_price'] = product_target_price
        except Exception, ex:
            self.log("Couldn't convert product price: " + response.url + "\n", level=log.WARNING)
def extract_result_products(self, response):
    hxs = HtmlXPathSelector(response)
    items = []

    results = hxs.select("//div[@class='list-item-info']/div[@class='sku-title']/h4/a")
    for result in results:
        item = SearchItem()
        #item['origin_site'] = site

        product_name_holder = result.select("text()").extract()
        if product_name_holder:
            item['product_name'] = product_name_holder[0].strip()
        else:
            self.log("Error: No product name: " + str(response.url) + " from product: " +
                     response.meta.get('origin_url', ''), level=log.ERROR)

        item['product_url'] = Utils.clean_url(
            Utils.add_domain(result.select("@href").extract()[0], "http://www.bestbuy.com"))

        if 'origin_url' in response.meta:
            item['origin_url'] = response.meta['origin_url']
        if 'origin_name' in response.meta:
            item['origin_name'] = response.meta['origin_name']
        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']

        model_holder = result.select(
            "../../../div[@class='sku-model']/ul/li[@class='model-number']/span[@id='model-value']/text()"
        ).extract()
        if model_holder:
            item['product_model'] = model_holder[0]

        price_holder = result.select(
            "../../../../div[@class='list-item-price']//div[@class='price-block']"
            "//div[@class='medium-item-price']/text()[normalize-space()]"
        ).extract()
        if price_holder:
            price = price_holder[0].strip()
            price = re.sub(",", "", price)
            price = float(price)
            item['product_target_price'] = price

        items.append(item)

    return items
def parseResults(self, response):
    hxs = HtmlXPathSelector(response)

    #site = response.meta['origin_site']
    origin_name = response.meta['origin_name']
    origin_model = response.meta['origin_model']

    # if this comes from a previous request, get last request's items and add to them the results
    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    results = hxs.select(
        "//div[@class='prodInfo']/div[@class='prodInfoBox']/a[@class='prodLink ListItemLink']")
    for result in results:
        item = SearchItem()
        #item['origin_site'] = site

        #TODO: usually the manufacturer is in bold, so maybe use that
        product_name = " ".join(result.select(".//text()").extract())

        # append text that is in <span> if any
        span_text = result.select("./span/text()")
        #TODO: use span text differently, as it is more important/relevant (bold)?
        for text in span_text:
            product_name += " " + text.extract()

        item['product_name'] = product_name

        rel_url = result.select("@href").extract()[0]
        root_url = "http://www.walmart.com"
        item['product_url'] = Utils.add_domain(rel_url, root_url)

        if 'origin_url' in response.meta:
            item['origin_url'] = response.meta['origin_url']
        if 'origin_id' in response.meta:
            item['origin_id'] = response.meta['origin_id']
            assert self.by_id
        else:
            assert not self.by_id

        items.add(item)

    response.meta['items'] = items
    response.meta['parsed'] = True
    return self.reduceResults(response)
def parseResults(self, response):
    hxs = HtmlXPathSelector(response)

    #site = response.meta['origin_site']
    origin_name = response.meta['origin_name']
    origin_model = response.meta['origin_model']

    # if this comes from a previous request, get last request's items and add to them the results
    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    results = hxs.select("//div[@class='hproduct']/div[@class='info-main']/h3/a")
    for result in results:
        item = SearchItem()
        #item['origin_site'] = site
        item['product_name'] = result.select("text()").extract()[0].strip()
        item['product_url'] = Utils.clean_url(
            Utils.add_domain(result.select("@href").extract()[0], "http://www.bestbuy.com"))

        if 'origin_url' in response.meta:
            item['origin_url'] = response.meta['origin_url']
        if 'origin_id' in response.meta:
            item['origin_id'] = response.meta['origin_id']
            # assert self.by_id
        # else:
        #     assert not self.by_id

        model_holder = result.select(
            "parent::node()/parent::node()//strong[@itemprop='model']/text()").extract()
        if model_holder:
            item['product_model'] = model_holder[0]

        items.add(item)

    response.meta['items'] = items
    response.meta['parsed'] = True
    return self.reduceResults(response)
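# A minimal sketch (an assumption, not from the original source): Utils.add_domain() and
# Utils.clean_url() are used throughout these parsers but defined elsewhere. Their assumed
# behavior, judging by the call sites, is roughly:
def add_domain(url, root_url):
    # hypothetical: prepend the domain to relative URLs, leave absolute URLs alone
    if url.startswith("http"):
        return url
    return root_url + url

def clean_url(url):
    # hypothetical: drop query string and fragment, keeping the canonical product URL
    return url.split("?")[0].split("#")[0]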
def parse0(self, response):
    item = SearchItem()
    selector = Selector(response)
    text0 = selector.xpath('//div/strong[@class="metrics-data align-vmiddle"]/text()').extract()[1].strip()
    text1 = selector.xpath('//span[@class="font-4 box1-r"]/text()').extract()[0]
    item['result'] = text0
    item['keyword'] = response.meta['item']
    item['link'] = text1
    yield item

# class Spider(CrawlSpider):
#     name = "search"
#     with open(r'input', 'r') as r:
#         f = r.readlines()
#     kw = set(f)
#     finished = set()
#
#     def start_requests(self):
#         while self.kw.__len__():
#             code = self.kw.pop()
#             self.finished.add(code)
#             code = code.strip('\n')
#             item = SearchItem()
#             item['keyword'] = code
#             sic_url = 'https://www.google.com/search?q=%s' % code
#             yield Request(url=sic_url, meta={"item": item['keyword']}, callback=self.parse0)
#
#     def parse0(self, response):
#         item = SearchItem()
#         selector = Selector(response)
#         text0 = selector.xpath('//div[@id="resultStats"]/text()').extract()
#         text0 = ''.join(text0)
#         num = re.findall(r'\d+', text0)
#         item['result'] = ''.join(num)
#         item['keyword'] = response.meta['item']
#         if item['keyword'] not in self.finished:
#             self.kw.add(item['keyword'])
#
#         yield item
def parseResults(self, response):
    hxs = HtmlXPathSelector(response)

    #site = response.meta['origin_site']
    origin_name = response.meta['origin_name']
    origin_model = response.meta['origin_model']

    # if this comes from a previous request, get last request's items and add to them the results
    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    results = hxs.select("//li[@class='productbox']")
    for result in results:
        product_link = result.select(".//a[@class='toplink']")
        item = SearchItem()
        #item['origin_site'] = site
        #TODO: site changed structure?
        item['product_url'] = product_link.select("@href").extract()[0]
        item['product_name'] = product_link.select("div[@class='prodname']/text()").extract()[0]
        #TODO: add brand?
        #item['brand'] = result.select("div[@class='prodname']/div[@class='prodbrandname emphasis']/text()").extract()[0]

        if 'origin_url' in response.meta:
            item['origin_url'] = response.meta['origin_url']
        if 'origin_id' in response.meta:
            item['origin_id'] = response.meta['origin_id']
            assert self.by_id
        else:
            assert not self.by_id

        items.add(item)

    response.meta['items'] = items
    response.meta['parsed'] = True
    return self.reduceResults(response)
def parseResults(self, response):
    hxs = HtmlXPathSelector(response)

    #site = response.meta['origin_site']
    origin_name = response.meta['origin_name']
    origin_model = response.meta['origin_model']

    # if this comes from a previous request, get last request's items and add to them the results
    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    # toysrus
    results = hxs.select("//a[@class='prodtitle']")
    for result in results:
        item = SearchItem()
        #item['origin_site'] = site
        item['product_name'] = result.select("text()").extract()[0]
        root_url = "http://www.toysrus.com"
        item['product_url'] = root_url + result.select("@href").extract()[0]

        if 'origin_url' in response.meta:
            item['origin_url'] = response.meta['origin_url']
        if 'origin_id' in response.meta:
            item['origin_id'] = response.meta['origin_id']
            assert self.by_id
        else:
            assert not self.by_id

        items.add(item)

    response.meta['items'] = items
    response.meta['parsed'] = True
    return self.reduceResults(response)
def parseResults(self, response):
    hxs = HtmlXPathSelector(response)

    #site = response.meta['origin_site']
    origin_name = response.meta['origin_name']
    origin_model = response.meta['origin_model']

    # if this comes from a previous request, get last request's items and add to them the results
    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    results = hxs.select(
        "//li[@class='product']/div[@class='product-content']/a[@class='pro-thumb']")
    for result in results:
        item = SearchItem()
        #item['origin_site'] = site
        item['product_name'] = result.select("span[@class='pro-name']/text()").extract()[0]
        item['product_url'] = result.select("@href").extract()[0]

        if 'origin_url' in response.meta:
            item['origin_url'] = response.meta['origin_url']
        if 'origin_id' in response.meta:
            item['origin_id'] = response.meta['origin_id']
            assert self.by_id
        else:
            assert not self.by_id

        items.add(item)

    response.meta['items'] = items
    response.meta['parsed'] = True
    return self.reduceResults(response)
def parse_item(self, response):
    item = self.config.get('item')
    if item:
        # cls = eval(item.get('class'))()
        # loader = eval(item.get('loader'))(cls, response=response)
        loader = SearchLoader(item=SearchItem(), response=response)
        loader.add_value('tenantId', self.tenantId)
        loader.add_value('indexName', self.indexName)
        loader.add_value('dataAnnotation', self.dataAnnotation)
        # format as 2016-03-20 11:45:39
        loader.add_value('createDate', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        # read the attribute configuration dynamically
        for key, value in item.get('attrs').items():  # attrs is JSON; JSON -> items
            for extractor in value:  # value: list, extractor: JSON
                if extractor.get('method') == 'xpath':
                    args = extractor.get('args')  # list
                    if key == 'title':
                        args = ['normalize-space(' + self.title + ')']
                    elif key == 'content':
                        args = [self.content]
                    loader.add_xpath(key, *args, **{'re': extractor.get('re')})
                if extractor.get('method') == 'css':
                    loader.add_css(key, *extractor.get('args'), **{'re': extractor.get('re')})
                if extractor.get('method') == 'value':
                    loader.add_value(key, *extractor.get('args'), **{'re': extractor.get('re')})
                if extractor.get('method') == 'attr':
                    loader.add_value(key, getattr(response, *extractor.get('args')))
        yield loader.load_item()
def parse_product_ebay(self, response):
    hxs = HtmlXPathSelector(response)

    items = response.meta['items']
    #site = response.meta['origin_site']
    origin_url = response.meta['origin_url']

    item = SearchItem()
    item['product_url'] = response.url
    #item['origin_site'] = site
    item['origin_url'] = origin_url

    if 'origin_id' in response.meta:
        item['origin_id'] = response.meta['origin_id']
        assert self.by_id
    else:
        assert not self.by_id

    # extract product name
    product_name = hxs.select("//h1[@id='itemTitle']/text()").extract()
    if not product_name:
        self.log("Error: No product name: " + str(response.url), level=log.INFO)
    else:
        item['product_name'] = product_name[0]

    # extract product brand
    product_brand_holder = hxs.select("//td[@class='attrLabels'][contains(normalize-space(),'Brand')]" + \
        "/following-sibling::node()[normalize-space()!=''][1]//text()[normalize-space()!='']").extract()
    if product_brand_holder:
        item['product_brand'] = product_brand_holder[0]

    # extract product model
    product_model_holder = hxs.select("//td[@class='attrLabels'][contains(normalize-space(),'Model')]" + \
        "/following-sibling::node()[normalize-space()!=''][1]//text()[normalize-space()!='']").extract()
    if not product_model_holder:
        product_model_holder = hxs.select("//td[@class='attrLabels'][contains(normalize-space(),'MPN')]" + \
            "/following-sibling::node()[normalize-space()!=''][1]//text()[normalize-space()!='']").extract()
    if product_model_holder:
        item['product_model'] = product_model_holder[0]

    # add result to items
    items.add(item)

    # if there are any more results to be parsed, send a request back to this method with the next product to be parsed
    product_urls = response.meta['search_results']
    if product_urls:
        request = Request(product_urls.pop(), callback=self.parse_product_ebay, meta=response.meta)
        request.meta['items'] = items
        # eliminate next product from pending list (this will be the new list with the first item popped)
        request.meta['search_results'] = product_urls
        return request
    else:
        # otherwise, we are done; send the response back to reduceResults (no need to make a new request)
        # add newly added items as meta
        # also add 'parsed' field to indicate that the parsing of all products was completed and they can be further used
        # (actually that the call was made from this method and was not the initial one, so it has to move on to the next request)
        response.meta['parsed'] = True
        response.meta['items'] = items
        return self.reduceResults(response)
def parseURL(self, response):
    site = response.meta['origin_site']
    hxs = HtmlXPathSelector(response)

    product_model = ""
    product_brand = ""
    product_price = ""

    ##############################################################
    # Extract product attributes (differently depending on site)

    if site == 'staples':
        product_name = hxs.select("//h1/text()").extract()[0]

        model_nodes = hxs.select("//p[@class='itemModel']/text()").extract()
        if model_nodes:
            model_node = model_nodes[0]
            model_node = re.sub("\W", " ", model_node, flags=re.UNICODE)
            m = re.match("(.*)Model:(.*)", model_node.encode("utf-8"), re.UNICODE)
            if m:
                product_model = m.group(2).strip()

    elif site == 'walmart':
        product_name_holder = hxs.select("//h1[@class='productTitle']/text()").extract()
        if product_name_holder:
            product_name = product_name_holder[0].strip()

            # get integer part of product price
            product_price_big = hxs.select("//span[@class='bigPriceText1']/text()").extract()
            if not product_price_big:
                self.log("Didn't find product price: " + response.url + "\n", level=log.DEBUG)

            # if there is a range of prices take their average
            if len(product_price_big) > 1:
                # remove $ and .
                product_price_min = re.sub("[\$\.,]", "", product_price_big[0])
                product_price_max = re.sub("[\$\.,]", "", product_price_big[-1])
                #TODO: check if they're ints?
                product_price_big = (int(product_price_min) + int(product_price_max)) / 2.0
            elif product_price_big:
                product_price_big = int(re.sub("[\$\.,]", "", product_price_big[0]))

            # get fractional part of price
            #TODO - not that important

            if product_price_big:
                product_price = product_price_big
        else:
            sys.stderr.write("Broken product page link (can't find item title): " + response.url + "\n")
            # return the item as a non-matched item
            item = SearchItem()
            #item['origin_site'] = site
            item['origin_url'] = response.url
            # remove unnecessary parameters
            m = re.match("(.*)\?enlargedSearch.*", item['origin_url'])
            if m:
                item['origin_url'] = m.group(1)
            #item['origin_id'] = self.extract_walmart_id(item['origin_url'])

            # don't return empty matches in manufacturer spider
            if self.name != 'manufacturer':
                yield item
            return

        #TODO: if it contains 2 words, first could be brand - also add it in similar_names function
        product_model_holder = hxs.select("//td[contains(text(),'Model')]/following-sibling::*/text()").extract()
        if product_model_holder:
            product_model = product_model_holder[0]

    #TODO: for the sites below, complete with missing logic, for not returning empty elements in manufacturer spider
    elif site == 'newegg':
        product_name_holder = hxs.select("//span[@itemprop='name']/text()").extract()
        if product_name_holder:
            product_name = product_name_holder[0].strip()
        else:
            sys.stderr.write("Broken product page link (can't find item title): " + response.url + "\n")
            item = SearchItem()
            #item['origin_site'] = site
            item['origin_url'] = response.url
            yield item
            return

        product_model_holder = hxs.select("//dt[text()='Model']/following-sibling::*/text()").extract()
        if product_model_holder:
            product_model = product_model_holder[0]

    else:
        raise CloseSpider("Unsupported site: " + site)

    if site == 'staples':
        zipcode = "12345"
        cookies = {"zipcode": zipcode}
    else:
        cookies = {}

    #######################################################################
    # Create search queries to the second site, based on product attributes

    request = None

    #TODO: search by alternative model numbers?
    #TODO: search by model number extracted from product name? Don't I do that implicitly? no, but in combinations

    # if there is no product model, try to extract it
    if not product_model:
        product_model = ProcessText.extract_model_from_name(product_name)
        # for logging purposes, set this back to the empty string if it wasn't found (so was None)
        if not product_model:
            product_model = ""
        # product_model_index = ProcessText.extract_model_nr_index(product_name)
        # if product_model_index >= 0:
        #     product_model = product_name[product_model_index]
        ## print "MODEL EXTRACTED: ", product_model, " FROM NAME ", product_name

    # if there is no product brand, get first word in name, assume it's the brand
    product_brand_extracted = ""
    #product_name_tokenized = ProcessText.normalize(product_name)
    product_name_tokenized = [word.lower() for word in product_name.split(" ")]
    #TODO: maybe extract brand as word after 'by', if 'by' is somewhere in the product name
    if len(product_name_tokenized) > 0 and re.match("[a-z]+", product_name_tokenized[0]):
        product_brand_extracted = product_name_tokenized[0].lower()

    # if we are in manufacturer spider, set target_site to manufacturer site
    # for manufacturer spider set target_site of request to brand extracted from name for this particular product
    if self.name == 'manufacturer':
        #TODO: restore commented code; if brand not found, try to search for it on every manufacturer site (build queries for every supported site)

        # hardcode target site to sony
        #self.target_site = 'sony'
        #self.target_site = product_brand_extracted
        #target_site = product_brand_extracted

        # can only go on if site is supported
        # (use dummy query)
        #if target_site not in self.build_search_pages("").keys():
        if product_brand_extracted not in self.build_search_pages("").keys():
            product_brands_extracted = set(self.build_search_pages("").keys()).intersection(set(product_name_tokenized))
            if product_brands_extracted:
                product_brand_extracted = product_brands_extracted.pop()
                #target_site = product_brand_extracted
            else:
                # give up and return item without match
                self.log("Manufacturer site not supported (" + product_brand_extracted +
                         ") or not able to extract brand from product name (" + product_name + ")\n",
                         level=log.ERROR)
                ## comment lines below to: don't return anything if you can't search on manufacturer site
                # item = SearchItem()
                # item['origin_url'] = response.url
                # item['origin_name'] = product_name
                # if product_model:
                #     item['origin_model'] = product_model
                # yield item
                return

        # if specific site is not set, search on manufacturer site as extracted from name
        if not self.manufacturer_site:
            target_site = product_brand_extracted
        else:
            # if it's set, continue only if it matches extracted brand
            if self.manufacturer_site != product_brand_extracted:
                self.log("Will abort matching for product, extracted brand does not match specified manufacturer option (" +
                         product_brand_extracted + ")\n", level=log.INFO)
                ## comment lines below to: don't return anything if you can't search on manufacturer site
                # item = SearchItem()
                # item['origin_url'] = response.url
                # item['origin_name'] = product_name
                # if product_model:
                #     item['origin_model'] = product_model
                # yield item
                return
            else:
                target_site = product_brand_extracted

        # # try to match it without specific site (manufacturer spider will try to search on all manufacturer sites)
        # target_site = None

    # for other (site specific) spiders, set target_site of request to class variable self.target_site set in class "constructor" (init_sub)
    else:
        target_site = self.target_site

    # 1) Search by model number
    if product_model:
        #TODO: model was extracted with ProcessText.extract_model_from_name(), without lowercasing; should I lowercase before adding it to query?
        query1 = self.build_search_query(product_model)
        search_pages1 = self.build_search_pages(query1)
        #page1 = search_pages1[self.target_site]
        page1 = search_pages1[target_site]
        request1 = Request(page1, callback=self.parseResults)

        # set amazon cookies
        if (self.target_site == 'amazon' and self.cookies_file):
            request1.cookies = self.amazon_cookies
            request1.headers['Cookies'] = self.amazon_cookie_header
            #request1.meta['dont_merge_cookies'] = True
            ## print "SET AMAZON COOKIES"

        request1.meta['query'] = query1
        request1.meta['target_site'] = target_site

        request = request1

    # 2) Search by product full name
    query2 = self.build_search_query(product_name)
    search_pages2 = self.build_search_pages(query2)
    #page2 = search_pages2[self.target_site]
    page2 = search_pages2[target_site]
    request2 = Request(page2, callback=self.parseResults)

    # set cookies for amazon
    if (self.target_site == 'amazon' and self.cookies_file):
        request2.cookies = self.amazon_cookies
        request2.headers['Cookies'] = self.amazon_cookie_header
        #request2.meta['dont_merge_cookies'] = True

    request2.meta['query'] = query2
    request2.meta['target_site'] = target_site

    pending_requests = []

    if not request:
        request = request2
    else:
        pending_requests.append(request2)

    # 3) Search by combinations of words in product's name
    # create queries
    for words in ProcessText.words_combinations(product_name, fast=self.fast):
        query3 = self.build_search_query(" ".join(words))
        search_pages3 = self.build_search_pages(query3)
        #page3 = search_pages3[self.target_site]
        page3 = search_pages3[target_site]
        request3 = Request(page3, callback=self.parseResults)

        # set amazon cookies
        if (self.target_site == 'amazon' and self.cookies_file):
            request3.cookies = self.amazon_cookies
            request3.headers['Cookies'] = self.amazon_cookie_header
            #request3.meta['dont_merge_cookies'] = True

        request3.meta['query'] = query3
        request3.meta['target_site'] = target_site

        pending_requests.append(request3)

    request.meta['pending_requests'] = pending_requests
    #request.meta['origin_site'] =

    # product page from source site
    #TODO: clean this URL? for walmart it added something with ?enlargedsearch=True
    request.meta['origin_url'] = response.url

    request.meta['origin_name'] = product_name
    request.meta['origin_model'] = product_model
    if product_price:
        request.meta['origin_price'] = product_price

    # origin product brand as extracted from name (basically the first word in the name)
    request.meta['origin_brand_extracted'] = product_brand_extracted

    # if self.by_id:
    #     request.meta['origin_id'] = self.extract_walmart_id(response.url)

    #self.target_site = product_brand_extracted
    #TODO: should this be here??
    target_site = product_brand_extracted

    # print "SENDING REQUEST FOR ", product_name, response.url
    yield request
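# A minimal sketch (an assumption, not from the original source): parseURL above relies on
# self.build_search_query() and self.build_search_pages(), which are defined elsewhere in
# the spider. Judging by how they are used, the first URL-encodes query text and the second
# maps each supported site name to a search-results URL; the URLs below are illustrative only.
import urllib

def build_search_query(self, text):
    # hypothetical: collapse whitespace and URL-encode the text
    return urllib.quote_plus(" ".join(text.split()).encode('utf-8'))

def build_search_pages(self, query):
    # hypothetical site -> search URL map; the real spider supports more sites
    return {
        'amazon': "http://www.amazon.com/s/?field-keywords=" + query,
        'walmart': "http://www.walmart.com/search/search-ng.do?search_query=" + query,
    }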
def parse(self, response):
    data = json.loads(str(response.text))
    if 1 == data['ok']:

        def load_blog_item(_data):
            # populate an item loader from an extracted blog dict
            # (the same fields are loaded for both card types below)
            data_item = ItemLoader(item=SearchItem(), response=response)
            for field in ("user_name", "user_id", "user_verified_reason",
                          "user_description", "user_followers_count",
                          "user_statuses_count", "reposts_count",
                          "comments_count", "attitudes_count", "user_content",
                          "created_at", "source", "mid", "idstr", "user_pics"):
                data_item.add_value(field, _data[field])
            return data_item

        card_list = data['data']['cards']
        for item in card_list:
            if "card_type_name" in item and "微博" == item["card_type_name"]:
                _data = self.__get_blog(item)
                yield load_blog_item(_data).load_item()

                # whether to also crawl the replies
                if self.is_get_reply:
                    reply_first_url = get_reply_url(id_str=_data['idstr'])
                    meta = {"idstr": _data["idstr"], "url": reply_first_url}
                    yield Request(url=reply_first_url, callback=self.parse_reply, meta=meta)
            else:
                if "card_group" in item:
                    for it in item["card_group"]:
                        if "mblog" in it:
                            _data = self.__get_blog(it)
                            yield load_blog_item(_data).load_item()

                            # whether to also crawl the replies
                            if self.is_get_reply:
                                reply_first_url = get_reply_url(id_str=_data['idstr'])
                                meta = {"idstr": _data["idstr"], "url": reply_first_url}
                                yield Request(url=reply_first_url, callback=self.parse_reply, meta=meta)
                        else:
                            # print("no_1_data")
                            pass
                else:
                    # print("no_data")
                    pass
    else:
        print("content interface error, ", "\n", data["msg"])

    if 1 > self.page:
        self.page += 1
        next_url = get_search_url(keyword=self.keyword, page=self.page)
        yield Request(next_url, callback=self.parse)
def parse_product_amazon(self, response):
    # print "PARSE AMAZON PRODUCT FOR", response.meta['origin_url'], response.url
    hxs = HtmlXPathSelector(response)

    items = response.meta['items']
    #site = response.meta['origin_site']
    origin_url = response.meta['origin_url']

    item = SearchItem()
    item['product_url'] = response.url
    #item['origin_site'] = site
    item['origin_url'] = origin_url
    item['origin_name'] = response.meta['origin_name']
    if 'origin_model' in response.meta:
        item['origin_model'] = response.meta['origin_model']

    # if 'origin_id' in response.meta:
    #     item['origin_id'] = response.meta['origin_id']
    #     assert self.by_id
    # else:
    #     assert not self.by_id

    # extract product name
    #TODO: id='title' doesn't work for all, should I use a 'contains' or something?
    # extract titles that are not empty (ignoring whitespace)
    # eliminate "Amazon Prime Free Trial"
    #TODO: to test this
    #product_name = filter(lambda x: not x.startswith("Amazon Prime"), hxs.select("//div[@id='title_feature_div']//h1//text()[normalize-space()!='']").extract())
    product_name = filter(lambda x: not x.startswith("Amazon Prime"),
                          hxs.select("//h1//text()[normalize-space()!='']").extract())
    if not product_name:
        # print "NO PRODUCT NAME FOR", response.url
        self.log("Error: No product name: " + str(response.url) + " for walmart product " + origin_url,
                 level=log.ERROR)

        # assume there is a captcha to crack
        # check if there is a form on the page - that means it's probably the captcha form
        forms = hxs.select("//form")
        if forms:
            # solve captcha
            captcha_text = None
            image = hxs.select(".//img/@src").extract()
            if image:
                captcha_text = self.CB.solve_captcha(image[0])
            # value to use if there was an exception
            if not captcha_text:
                captcha_text = ''

            # create a FormRequest to this same URL, with everything needed in meta
            # items, cookies and search_urls not changed from previous response so no need to set them again
            # redo the entire request (no items will be lost)
            return [FormRequest.from_response(response,
                                              callback=self.parse_product_amazon,
                                              formdata={'field-keywords': captcha_text},
                                              meta=response.meta)]
    else:
        item['product_name'] = product_name[0].strip()

        # extract product model number
        model_number_holder = hxs.select(
            "//tr[@class='item-model-number']/td[@class='value']/text()"
            " | //li/b/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/text()"
        ).extract()
        if model_number_holder:
            item['product_model'] = model_number_holder[0].strip()
        # if no product model explicitly on the page, try to extract it from name
        else:
            product_model_extracted = ProcessText.extract_model_from_name(item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted
                ## print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

    brand_holder = hxs.select("//div[@id='brandByline_feature_div']//a/text() | //a[@id='brand']/text()").extract()
    if brand_holder:
        item['product_brand'] = brand_holder[0]
    else:
        pass
        #sys.stderr.write("Didn't find product brand: " + response.url + "\n")

    # extract price
    #! extracting list price and not discount price when discounts available?
    price_holder = hxs.select("//span[contains(@id,'priceblock')]/text() | //span[@class='a-color-price']/text() " + \
        "| //span[@class='listprice']/text() | //span[@id='actualPriceValue']/text() | //b[@class='priceLarge']/text() | //span[@class='price']/text()").extract()

    # if we can't find it like above, try other things:
    if not price_holder:
        # prefer new prices to used ones
        price_holder = hxs.select("//span[contains(@class, 'olp-new')]//text()[contains(.,'$')]").extract()

    if price_holder:
        product_target_price = price_holder[0].strip()
        # remove commas separating orders of magnitude (ex 2,000)
        product_target_price = re.sub(",", "", product_target_price)
        m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
        if m:
            item['product_target_price'] = float(m.group(1))
        else:
            self.log("Didn't match product price: " + product_target_price + " " + response.url + "\n",
                     level=log.WARNING)
    else:
        self.log("Didn't find product price: " + response.url + "\n", level=log.INFO)

    # add result to items
    items.add(item)

    # print "STILL IN parse_product FOR", response.url
    product_urls = response.meta['search_results']

    # try to send request to parse next product, try until url for next product url is valid (response not 404)
    # this is needed because if next product url is not valid, this request will not be sent
    # and all info about this match (stored in request meta) will be lost

    # find first valid next product url
    next_product_url = None
    if product_urls:
        next_product_url = product_urls.pop()
    while (product_urls and not self.is_valid_url(next_product_url)):
        # print "404 FROM", next_product_url
        next_product_url = product_urls.pop()
    # handle corner case of bad next product url
    if not product_urls and next_product_url and not self.is_valid_url(next_product_url):
        next_product_url = None

    # if a next product url was found, send new request back to parse_product_url
    if next_product_url:
        request = Request(next_product_url, callback=self.parse_product_amazon, meta=response.meta)
        if self.cookies_file:
            request.cookies = self.amazon_cookies
            request.headers['Cookies'] = self.amazon_cookie_header
            #request.meta['dont_merge_cookies'] = True
        request.meta['items'] = items
        # eliminate next product from pending list (this will be the new list with the first item popped)
        request.meta['search_results'] = product_urls
        # print "RETURNING FROM PARSE AMAZON PRODUCT TO parse_product FOR", response.meta['origin_url'], response.url, "NEXT IS", next_product_url
        respcode = urllib.urlopen(next_product_url)
        return request
    # if no next valid product url was found
    else:
        # we are done; send the response back to reduceResults (no need to make a new request)
        # add newly added items as meta
        # also add 'parsed' field to indicate that the parsing of all products was completed and they can be further used
        # (actually that the call was made from this method and was not the initial one, so it has to move on to the next request)
        response.meta['parsed'] = True
        response.meta['items'] = items
        # print "RETURNING FROM PARSE AMAZON PRODUCT TO reduce_results FOR", response.meta['origin_url'], response.url
        return self.reduceResults(response)
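# A minimal sketch (an assumption, not from the original source): parse_product_amazon
# above calls self.is_valid_url() to skip 404 product pages, but that helper is not shown
# in this section. Given the urllib usage nearby, it presumably checks the response code:
import urllib

def is_valid_url(self, url):
    # hypothetical: True unless the URL answers with HTTP 404 (or cannot be fetched at all)
    try:
        return urllib.urlopen(url).getcode() != 404
    except IOError:
        return False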
def parseResults(self, response):
    hxs = HtmlXPathSelector(response)

    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    results = hxs.select("//ul[@class='products']//div[@class='product ']//h3//a")
    for result in results:
        item = SearchItem()

        product_url = result.select("@href").extract()[0] if result.select("@href") else None
        product_name = result.select("@title").extract()[0] if result.select("@title") else None

        # assert name is not abbreviated
        # empirically, this only seems to produce false positives, so removed
        # assert '...' not in product_name

        # skip this result if there is no product name
        if product_name and product_url:
            # clean url
            item['product_url'] = Utils.add_domain(product_url, self.base_url)
            item['product_name'] = product_name
        else:
            self.log("No product name: " + str(response.url) + " from product: " +
                     response.meta['origin_url'], level=log.ERROR)
            continue

        # add url, name and model of product to be matched (from origin site)
        item['origin_url'] = response.meta['origin_url']
        item['origin_name'] = response.meta['origin_name']
        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']

        # extract product model from name
        product_model_extracted = ProcessText.extract_model_from_name(item['product_name'])
        if product_model_extracted:
            item['product_model'] = product_model_extracted

        #TODO: extract: price, brand?

        # add result to items
        items.add(item)

    # extract product info from product pages (send request to parse first URL in list)
    # add as meta all that was received as meta, will pass it on to reduceResults function in the end
    # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed
    # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not
    # if there are, reduceResults will send the next one back here, if not it will return the final result
    response.meta['items'] = items

    # and field 'parsed' to indicate that the call was received from this method (was not the initial one)
    #TODO: do we still need this?
    response.meta['parsed'] = True

    # only send the response we have as an argument, no need to make a new request
    return self.reduceResults(response)
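# A minimal sketch (an assumption, not from the original source): several parsers above use
# ProcessText.extract_model_from_name(), defined elsewhere. The real implementation is
# unknown; a crude illustrative heuristic would pick the first token mixing letters and digits:
import re

def extract_model_from_name(product_name):
    # hypothetical heuristic: a model number usually looks like "DSC-RX100" or "UN55F8000"
    for token in product_name.split():
        if re.search("[0-9]", token) and re.search("[a-zA-Z]", token):
            return token
    return None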
def parse_product_currys(self, response):
    hxs = HtmlXPathSelector(response)

    items = response.meta['items']
    #site = response.meta['origin_site']
    origin_url = response.meta['origin_url']

    item = SearchItem()
    item['product_url'] = response.url
    #item['origin_site'] = site
    item['origin_url'] = origin_url
    item['origin_name'] = response.meta['origin_name']
    if 'origin_model' in response.meta:
        item['origin_model'] = response.meta['origin_model']
    if 'origin_upc' in response.meta:
        item['origin_upc'] = response.meta['origin_upc']
    if 'origin_brand' in response.meta:
        item['origin_brand'] = response.meta['origin_brand']

    product_name_node = hxs.select("//span[@itemprop='name']/text()").extract()
    if product_name_node:
        product_name = product_name_node[0].strip()
    else:
        self.log("Error: No product name: " + str(response.url) +
                 " for source product " + origin_url, level=log.ERROR)
        # TODO: is this ok? I think so
        return

    item['product_name'] = product_name

    # extract product model number
    # TODO: no model?
    # TODO: no upc?

    #TODO: this reuses the itemprop='name' node, so 'product_brand' gets the product name; should it be itemprop='brand'?
    brand_holder = hxs.select("//span[@itemprop='name']/text()").extract()
    if brand_holder:
        item['product_brand'] = brand_holder[0]

    # extract price
    price_holder = hxs.select("//span[@class='currentPrice']/ins/text()").extract()
    # if we can't find it like above, try other things:
    if price_holder:
        product_target_price = price_holder[0].strip()
        # remove commas separating orders of magnitude (ex 2,000)
        product_target_price = re.sub(",", "", product_target_price)
        m = re.match("(\xa3)([0-9]+\.?[0-9]*)", product_target_price)
        if m:
            item['product_target_price'] = float(m.group(2))
            currency = m.group(1)
            item['product_target_price'] = Utils.convert_to_dollars(item['product_target_price'], currency)
        else:
            self.log("Didn't match product price: " + product_target_price + " " + response.url + "\n",
                     level=log.WARNING)
    else:
        self.log("Didn't find product price: " + response.url + "\n", level=log.INFO)

    # add result to items
    items.add(item)

    product_urls = response.meta['search_results']

    # try to send request to parse next product, try until url for next product url is valid (response not 404)
    # this is needed because if next product url is not valid, this request will not be sent
    # and all info about this match (stored in request meta) will be lost

    # find first valid next product url
    next_product_url = None
    if product_urls:
        next_product_url = product_urls.pop()

    # if a next product url was found, send new request back to parse_product_url
    if next_product_url:
        request = Request(next_product_url, callback=self.parse_product_currys, meta=response.meta)
        request.meta['items'] = items
        # eliminate next product from pending list (this will be the new list with the first item popped)
        request.meta['search_results'] = product_urls
        return request
    # if no next valid product url was found
    else:
        # we are done; send the response back to reduceResults (no need to make a new request)
        # add newly added items as meta
        # also add 'parsed' field to indicate that the parsing of all products was completed and they can be further used
        # (actually that the call was made from this method and was not the initial one, so it has to move on to the next request)
        response.meta['parsed'] = True
        response.meta['items'] = items
        return self.reduceResults(response)
def parse_product_samsung(self, response):
    hxs = HtmlXPathSelector(response)

    items = response.meta['items']
    #site = response.meta['origin_site']
    origin_url = response.meta['origin_url']

    # create item
    item = SearchItem()
    item['product_url'] = response.url
    item['origin_url'] = origin_url
    item['origin_name'] = response.meta['origin_name']
    # hardcode brand to samsung
    item['product_brand'] = 'samsung'

    # extract product name, brand, model, etc; add to items
    product_info = hxs.select("//ul[@class='product-info']")
    #TODO: for some products name is not extracted correctly
    product_name = product_info.select("meta[@itemprop='name']/@content")
    if not product_name:
        self.log("Error: No product name: " + str(response.url), level=log.INFO)
    else:
        item['product_name'] = product_name.extract()[0]

    product_model = product_info.select("meta[@itemprop='model']/@content")
    if product_model:
        item['product_model'] = product_model.extract()[0]

    #TODO
    # item['product_images'] =
    # #TODO: to check
    # item['product_videos'] =

    items.add(item)

    # if there are any more results to be parsed, send a request back to this method with the next product to be parsed
    product_urls = response.meta['search_results']
    if product_urls:
        request = Request(product_urls.pop(), callback=self.parse_product_samsung, meta=response.meta)
        request.meta['items'] = items
        # eliminate next product from pending list (this will be the new list with the first item popped)
        request.meta['search_results'] = product_urls
        return request
    else:
        # otherwise, we are done; send the response back to reduceResults (no need to make a new request)
        # # we are finished so we should also close the driver
        # if self.driver:
        #     self.driver.close()
        response.meta['parsed'] = True
        response.meta['items'] = items
        return self.reduceResults(response)
def parse_product_target(self, response):
    hxs = HtmlXPathSelector(response)

    items = response.meta['items']
    #site = response.meta['origin_site']
    origin_url = response.meta['origin_url']

    item = SearchItem()
    item['product_url'] = response.url
    #item['origin_site'] = site
    item['origin_url'] = origin_url
    item['origin_name'] = response.meta['origin_name']
    if 'origin_model' in response.meta:
        item['origin_model'] = response.meta['origin_model']

    # extract product name
    #TODO: is this general enough?
    product_name = hxs.select("//h2[@class='product-name item']/span[@itemprop='name']/text()").extract()

    # if you can't find product name in product page, use the one extracted from results page
    if not product_name:
        item['product_name'] = response.meta['product_name']
        self.log("Error: product name not found on product page, extracted from results page: " +
                 item['product_name'] + " " + origin_url, level=log.INFO)
    else:
        item['product_name'] = product_name[0].strip()

    if not item['product_name']:
        self.log("Error: No product name: " + str(response.url) + " from product: " + origin_url,
                 level=log.INFO)
    else:
        #TODO: no model number field?
        model_number_holder = None
        if model_number_holder:
            item['product_model'] = model_number_holder[0].strip()
        # if no product model explicitly on the page, try to extract it from name
        else:
            product_model_extracted = ProcessText.extract_model_from_name(item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted
                #print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

    #TODO: no brand field?

    # extract price
    #! extracting list price and not discount price when discounts available?
    #TODO: complete this with other types of pages
    price_holder = hxs.select("//span[@class='offerPrice']/text()").extract()
    if price_holder:
        product_target_price = price_holder[0].strip()
        # remove commas separating orders of magnitude (ex 2,000)
        product_target_price = re.sub(",", "", product_target_price)
        m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
        if m:
            item['product_target_price'] = float(m.group(1))
        else:
            sys.stderr.write("Didn't match product price: " + product_target_price + " " + response.url + "\n")
    else:
        sys.stderr.write("Didn't find product price: " + response.url + "\n")

    # add result to items
    items.add(item)

    # if there are any more results to be parsed, send a request back to this method with the next product to be parsed
    product_urls_and_names = response.meta['search_results']
    if product_urls_and_names:
        product_url_and_name = product_urls_and_names.pop()
        request = Request(product_url_and_name[0], callback=self.parse_product_target, meta=response.meta)
        request.meta['items'] = items
        # eliminate next product from pending list (this will be the new list with the first item popped)
        # send product name with request as well
        request.meta['product_name'] = product_url_and_name[1]
        request.meta['search_results'] = product_urls_and_names
        return request
    else:
        # otherwise, we are done; send the response back to reduceResults (no need to make a new request)
        # add newly added items as meta
        # also add 'parsed' field to indicate that the parsing of all products was completed and they can be further used
        # (actually that the call was made from this method and was not the initial one, so it has to move on to the next request)
        response.meta['parsed'] = True
        response.meta['items'] = items
        return self.reduceResults(response)
def parseResults(self, response):
    hxs = HtmlXPathSelector(response)

    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    #results = hxs.select("//ul[@class='productsListView']/li")
    results = hxs.select("//li[contains(@class,'tile standard')]")
    for result in results:
        item = SearchItem()

        product_title_holder = result.select(".//div[@class='tileInfo']/a[contains(@class,'productTitle')]")
        product_url = product_title_holder.select("@href").extract()
        product_name = product_title_holder.select("@title").extract()
        #print "ITEM", product_name

        # skip this result if there is no product name
        if product_name and product_url:
            # clean url
            m = re.match("(.*)#prodSlot.*", product_url[0])
            if m:
                item['product_url'] = m.group(1)
            else:
                item['product_url'] = product_url[0]
            item['product_name'] = product_name[0]
        else:
            self.log("No product name: " + str(response.url) + " from product: " +
                     response.meta['origin_url'], level=log.ERROR)
            continue

        # add url, name and model of product to be matched (from origin site)
        item['origin_url'] = response.meta['origin_url']
        item['origin_name'] = response.meta['origin_name']
        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']

        # extract product model from name
        product_model_extracted = ProcessText.extract_model_from_name(item['product_name'])
        if product_model_extracted:
            item['product_model'] = product_model_extracted

        # extract price
        #! extracting regular price and not discount price when discounts available?
        price_holder = result.select(".//p[@class='regularprice-label']//text()[contains(.,'$')]").extract()
        # second attempt at finding price
        if not price_holder:
            price_holder = result.select(".//*[contains(@class, 'price price-label')]/text()[contains(.,'$')]").extract()

        if price_holder:
            product_target_price = price_holder[0].strip()
            # remove commas separating orders of magnitude (ex 2,000)
            product_target_price = re.sub(",", "", product_target_price)
            # if more than one match, it will get the first one
            m = re.match("\$([0-9]+\.?[0-9]*)", product_target_price)
            if m:
                item['product_target_price'] = float(m.group(1))
            else:
                self.log("Didn't match product price: " + product_target_price + " " + response.url + "\n",
                         level=log.WARNING)
        else:
            self.log("Didn't find product price: " + response.url + "\n", level=log.DEBUG)

        # extract product brand
        brand_holder = product_title_holder.select("parent::node()//a[contains(@class,'productBrand')]/a/text()").extract()
        if brand_holder:
            item['product_brand'] = brand_holder[0]
            self.log("Extracted brand: " + item['product_brand'] + " from results page: " + str(response.url),
                     level=log.DEBUG)

        # add result to items
        items.add(item)

    # extract product info from product pages (send request to parse first URL in list)
    # add as meta all that was received as meta, will pass it on to reduceResults function in the end
    # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed
    # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not
    # if there are, reduceResults will send the next one back here, if not it will return the final result
    response.meta['items'] = items

    # and field 'parsed' to indicate that the call was received from this method (was not the initial one)
    #TODO: do we still need this?
    response.meta['parsed'] = True

    # only send the response we have as an argument, no need to make a new request
    return self.reduceResults(response)
def parse_product(self, response):
    # redirect pages, if handled, can return empty bodies
    # especially for kohls
    if not response.body:
        self.log("Retried empty page: " + response.url, level=log.WARNING)
        return Request(response.url, callback=self.parse_product, meta=response.meta)

    # try to avoid mobile versions
    # especially for kohls
    if response.url.startswith("http://m."):
        meta = response.meta
        meta['dont_redirect'] = True
        url = re.sub("/m\.", "/www.", response.url)
        self.log("Retrying: redirecting mobile page to www page", level=log.WARNING)
        return Request(url, callback=self.parse_product, meta=meta)

    origin_product_id = response.meta['origin_product_id']
    current_query = response.meta['query']
    origin_url = self.results[origin_product_id]['origin_product']['origin_url']

    item = SearchItem()
    item['product_url'] = response.url
    for field in self.results[origin_product_id]['origin_product'].keys():
        item[field] = self.results[origin_product_id]['origin_product'][field]

    # all product items from all queries
    items = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['product_items'], \
        self.results[origin_product_id]['search_requests']), [])

    # all product urls from all queries
    product_urls = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['search_results'], \
        self.results[origin_product_id]['search_requests']), [])
    product_urls = set(product_urls)

    item = self.extract_product_data(response, item)

    # add result to items (if it was successful)
    if item:
        self.results[origin_product_id]['search_requests'][current_query]['product_items'].append(item)

    # try to send request to parse next product, try until url for next product url is valid (response not 404)
    # this is needed because if next product url is not valid, this request will not be sent
    # and all info about this match (stored in request meta) will be lost

    # find first valid next product url
    next_product_url = None
    if product_urls:
        next_product_url = product_urls.pop()

    # if a next product url was found, send new request back to parse_product_url
    if next_product_url:
        request = Request(next_product_url, callback=self.parse_product, meta=response.meta)
        # eliminate next product from pending list (this will be the new list with the first item popped)
        self.remove_result_from_queue(origin_product_id, next_product_url)
        return request
    # if no next valid product url was found
    else:
        # we are done; send the response back to reduceResults (no need to make a new request)
        # add newly added items as meta
        # also add 'parsed' field to indicate that the parsing of all products was completed and they can be further used
        # (actually that the call was made from this method and was not the initial one, so it has to move on to the next request)
        response.meta['parsed'] = True
        return self.reduceResults(response)
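# A minimal sketch (an assumption, not from the original source): parse_product above calls
# self.remove_result_from_queue(), which is not shown in this section. Given the
# self.results bookkeeping used there, its assumed job is to drop a URL from every query's
# pending search_results list for the given origin product:
def remove_result_from_queue(self, origin_product_id, url):
    # hypothetical helper mirroring the structure traversed in parse_product
    for query in self.results[origin_product_id]['search_requests']:
        search_results = self.results[origin_product_id]['search_requests'][query]['search_results']
        if url in search_results:
            search_results.remove(url)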
def parseResults(self, response):
    hxs = HtmlXPathSelector(response)

    if 'items' in response.meta:
        items = response.meta['items']
    else:
        items = set()

    results = hxs.select("//h3[@class='productTitle']/a")
    for result in results:
        item = SearchItem()
        product_url = result.select("@href").extract()[0]

        # extract all text in <a> (contains product name inside <strong>, and size (ml) directly in text())
        # node containing full product name if the displayed one is abbreviated;
        # use this one if it exists, and the displayed one if it doesn't
        product_name_node = result.select("strong/abbr/@title")
        product_name = product_name_node.extract()[0] if product_name_node else result.select("strong/text()").extract()[0]

        # assert name is not abbreviated
        assert '...' not in product_name

        # add product quantity
        product_quantity_node = result.select("text()[normalize-space()!='']")
        product_quantity = product_quantity_node.extract()[0].strip() if product_quantity_node else ""

        product_name_full = product_name + " " + product_quantity
        #print "ITEM", product_name

        # skip this result if there is no product name
        if product_name and product_url:
            # clean url
            item['product_url'] = Utils.add_domain(Utils.clean_url(product_url), self.base_url)
            item['product_name'] = product_name_full
        else:
            self.log("No product name: " + str(response.url) + " from product: " +
                     response.meta['origin_url'], level=log.ERROR)
            continue

        # add url, name and model of product to be matched (from origin site)
        item['origin_url'] = response.meta['origin_url']
        item['origin_name'] = response.meta['origin_name']
        if 'origin_model' in response.meta:
            item['origin_model'] = response.meta['origin_model']

        # extract product model from name
        product_model_extracted = ProcessText.extract_model_from_name(item['product_name'])
        if product_model_extracted:
            item['product_model'] = product_model_extracted

        #TODO: extract: price, brand?

        # add result to items
        items.add(item)

    # extract product info from product pages (send request to parse first URL in list)
    # add as meta all that was received as meta, will pass it on to reduceResults function in the end
    # also send as meta the entire results list (the product pages URLs), will receive callback when they have all been parsed
    # send the request back to reduceResults (with updated 'items') whether there are any more pending requests or not
    # if there are, reduceResults will send the next one back here, if not it will return the final result
    response.meta['items'] = items

    # and field 'parsed' to indicate that the call was received from this method (was not the initial one)
    #TODO: do we still need this?
    response.meta['parsed'] = True

    # only send the response we have as an argument, no need to make a new request
    return self.reduceResults(response)
def parse_product_amazon(self, response):
    hxs = HtmlXPathSelector(response)

    origin_product_id = response.meta['origin_product_id']
    current_query = response.meta['query']
    origin_url = self.results[origin_product_id]['origin_product']['origin_url']

    item = SearchItem()
    item['product_url'] = response.url
    for field in self.results[origin_product_id]['origin_product'].keys():
        item[field] = self.results[origin_product_id]['origin_product'][field]

    # all product items from all queries
    items = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['product_items'], \
        self.results[origin_product_id]['search_requests']), [])

    # all product urls from all queries
    product_urls = sum(map(lambda q: self.results[origin_product_id]['search_requests'][q]['search_results'], \
        self.results[origin_product_id]['search_requests']), [])
    product_urls = set(product_urls)

    #TODO: to test this
    #product_name = filter(lambda x: not x.startswith("Amazon Prime"), hxs.select("//div[@id='title_feature_div']//h1//text()[normalize-space()!='']").extract())
    product_name_node = hxs.select('//h1[@id="title"]/span[@id="productTitle"]/text()').extract()
    product_name = None
    if not product_name_node:
        product_name_node = hxs.select('//h1[@id="aiv-content-title"]//text()').extract()
    if not product_name_node:
        product_name_node = hxs.select('//div[@id="title_feature_div"]/h1//text()').extract()

    if product_name_node:
        product_name = product_name_node[0].strip()
    else:
        # needs special treatment
        product_name_node = hxs.select(
            '//h1[@class="parseasinTitle " or @class="parseasinTitle"]/span[@id="btAsinTitle"]//text()').extract()
        if product_name_node:
            product_name = " ".join(product_name_node).strip()

    if not product_name:
        # log this error:
        # if the number of retries is not exhausted, it might just be a captcha page, not an insurmountable error
        if 'captcha_retries' in response.meta and response.meta['captcha_retries'] <= self.MAX_CAPTCHA_RETRIES:
            self.log("Error: No product name: " + str(response.url) + " for walmart product " + origin_url,
                     level=log.WARNING)
        else:
            # if it comes from a solved captcha page, then it's an error if it's still not found
            self.log("Error: No product name: " + str(response.url) + " for walmart product " + origin_url,
                     level=log.ERROR)

        # try this: don't remove captcha_retries from meta, may cause infinite loops, works
        # if response.meta['captcha_retries'] > self.MAX_CAPTCHA_RETRIES:
        #     del response.meta['captcha_retries']

        # if we have reached the maximum number of retries, do nothing (the item just won't be added to the "items" list)
        # if we haven't reached maximum retries, try again
        if 'captcha_retries' not in response.meta \
                or 'captcha_retries' in response.meta and response.meta['captcha_retries'] <= self.MAX_CAPTCHA_RETRIES:
            # assume there is a captcha to crack
            # check if there is a form on the page - that means it's probably the captcha form
            forms = hxs.select("//form")
            if forms:
                # solve captcha
                captcha_text = None
                image = hxs.select(".//img/@src").extract()
                if image:
                    captcha_text = self.CB.solve_captcha(image[0])

                # value to use if there was an exception
                if not captcha_text:
                    captcha_text = ''

                # create a FormRequest to this same URL, with everything needed in meta
                # items, cookies and search_urls not changed from previous response so no need to set them again

                # redo the entire request (no items will be lost)
                meta = response.meta
                # counter indicating how many times we have already retried to solve the captcha
                if 'captcha_retries' in meta:
                    meta['captcha_retries'] += 1
                else:
                    meta['captcha_retries'] = 1
                return [FormRequest.from_response(
                    response,
                    callback=self.parse_product_amazon,
                    formdata={'field-keywords': captcha_text},
                    meta=meta)]

    else:
        item['product_name'] = product_name

        # extract product model number
        model_number_holder = hxs.select(
            """//tr[@class='item-model-number']/td[@class='value']/text() |
               //li/b/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/text() |
               //span/text()[normalize-space()='Item model number:']/parent::node()/parent::node()/span[2]/text()"""
        ).extract()
        if model_number_holder:
            item['product_model'] = model_number_holder[0].strip()
        # if there is no explicit product model on the page, try to extract it from the name
        else:
            product_model_extracted = ProcessText.extract_model_from_name(item['product_name'])
            if product_model_extracted:
                item['product_model'] = product_model_extracted
                ## print "MODEL EXTRACTED: ", product_model_extracted, " FROM NAME ", item['product_name'].encode("utf-8")

        upc_node = hxs.select(
            "//li/b/text()[normalize-space()='UPC:']/parent::node()/parent::node()/text()").extract()
        if upc_node:
            # may contain several whitespace-separated UPCs; store them as a list
            upc = upc_node[0].strip().split()
            item['product_upc'] = upc

        manufacturer_code_node = hxs.select(
            "//li/b/text()[normalize-space()='Manufacturer reference:']/parent::node()/parent::node()/text()").extract()
        if manufacturer_code_node:
            manufacturer_code = manufacturer_code_node[0].strip()
            item['manufacturer_code'] = manufacturer_code

        try:
            # for lowest level category:
            # TODO: test the xpath for the second type of page (see second type of xpath for top-level category)
            # bestsellers_rank = hxs.select("//tr[@id='SalesRank']/td[@class='value']/ul/li/span/text()" + \
            #     "| //li[@id='SalesRank']/ul/li/span/text()").re("#[0-9,]+")[0]

            # for top-level category:
            bestsellers_rank = hxs.select("//tr[@id='SalesRank']/td[@class='value']/text()" +
                                          " | //li[@id='SalesRank']/text()").re("#[0-9,]+")[0]
            # strip the leading "#" and the commas, e.g. "#1,234" -> 1234
            item['bestsellers_rank'] = int(re.sub(",", "", "".join(bestsellers_rank[1:])))
        except Exception, e:
            if self.output == 6 or self.bestsellers_link:
                self.log("Didn't find product rank: " + str(e) + " " + response.url + "\n", level=log.INFO)

        asin_node = hxs.select(
            "//li/b/text()[normalize-space()='ASIN:']/parent::node()/parent::node()/text()").extract()
        if asin_node:
            item['product_asin'] = asin_node[0].strip()

        brand_holder = hxs.select(
            "//div[@id='brandByline_feature_div']//a/text() | //a[@id='brand']/text()").extract()
        if brand_holder:
            item['product_brand'] = brand_holder[0]
        else:
            pass
            #sys.stderr.write("Didn't find product brand: " + response.url + "\n")

        # extract price
        #! extracting list price and not discount price when discounts available?
        price_holder = hxs.select("//span[contains(@id,'priceblock')]/text() | //span[@class='a-color-price']/text() " + \
            "| //span[@class='listprice']/text() | //span[@id='actualPriceValue']/text() | //b[@class='priceLarge']/text() | //span[@class='price']/text()").extract()

        # if we can't find it like above, try other things:
        if not price_holder:
            # prefer new prices to used ones
            # TODO: doesn't work for amazon.co.uk (pounds), but isn't needed very often
            price_holder = hxs.select("//span[contains(@class, 'olp-new')]//text()[contains(.,'$')]").extract()

        if price_holder:
            product_target_price = price_holder[0].strip()
            # remove commas separating orders of magnitude (ex 2,000)
            product_target_price = re.sub(",", "", product_target_price)
            m = re.match("(\$|\xa3)([0-9]+\.?[0-9]*)", product_target_price)
            if m:
                item['product_target_price'] = float(m.group(2))
                currency = m.group(1)
                if currency != "$":
                    item['product_target_price'] = Utils.convert_to_dollars(
                        item['product_target_price'], currency)
            else:
                self.log("Didn't match product price: " + product_target_price + " " + response.url + "\n",
                         level=log.WARNING)
        else:
            self.log("Didn't find product price: " + response.url + "\n", level=log.INFO)

        try:
            item['product_category_tree'] = \
                filter(None, map(lambda c: c.strip(),
                    hxs.select("//ul[li[@class='a-breadcrumb-divider']]/li/span[@class='a-list-item']/a/text()").extract()))
        except:
            pass

        try:
            item['product_keywords'] = hxs.select("//meta[@name='keywords']/@content").extract()[0]
        except:
            pass

        try:
            product_image = hxs.select("//img[@id='landingImage']/@src").extract()[0]
            item['product_image_url'] = product_image
            item['product_image_encoded'] = ProcessText.encode_image(product_image)
        except:
            pass

        # add result to items
        self.results[origin_product_id]['search_requests'][current_query]['product_items'].append(item)
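# Standalone sketch of the price parsing above: strip thousands separators,
# then match a currency symbol followed by the amount (u"\xa3" is the pound
# sign). This is only a demo of the regex; in the spider, non-dollar amounts
# are then passed through the project's own Utils.convert_to_dollars.
import re

def parse_price(text):
    # remove commas separating orders of magnitude: "$2,000.50" -> "$2000.50"
    text = re.sub(",", "", text.strip())
    m = re.match(u"(\$|\xa3)([0-9]+\.?[0-9]*)", text)
    if not m:
        return None
    return (m.group(1), float(m.group(2)))   # (currency symbol, amount)

print(parse_price("$2,000.50"))    # ('$', 2000.5)
print(parse_price(u"\xa319.99"))   # (pound sign, 19.99)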
def reduceResults(self, response):
    # print "IN REDUCE RESULTS"

    items = response.meta['items']
    #site = response.meta['origin_site']

    #TODO: do we still need this?
    if 'parsed' not in response.meta:
        # pass to the specific parse results function (in the derived class)
        return self.parseResults(response)
    else:
        del response.meta['parsed']

        ## print stuff
        self.log("PRODUCT: " + response.meta['origin_name'].encode("utf-8") + " MODEL: "
                 + response.meta['origin_model'].encode("utf-8"), level=log.DEBUG)
        self.log("QUERY: " + response.meta['query'], level=log.DEBUG)
        self.log("MATCHES: ", level=log.DEBUG)
        for item in items:
            self.log(item['product_name'].encode("utf-8"), level=log.DEBUG)
        self.log('\n', level=log.DEBUG)

        # if there is a pending request (the current request used the product model, and the pending request is to use the product name),
        # continue with that one and send the current results to it as metadata
        if 'pending_requests' in response.meta:
            # yield the first request in the queue and send the other ones as metadata
            pending_requests = response.meta['pending_requests']
            if pending_requests:
                # print "PENDING REQUESTS FOR", response.meta['origin_url'], response.meta['origin_name']
                request = pending_requests[0]

                # update pending requests
                request.meta['pending_requests'] = pending_requests[1:]

                request.meta['items'] = items
                #request.meta['origin_site'] = response.meta['origin_site']
                # product page from source site
                request.meta['origin_url'] = response.meta['origin_url']
                request.meta['origin_name'] = response.meta['origin_name']
                request.meta['origin_model'] = response.meta['origin_model']
                if 'origin_price' in response.meta:
                    request.meta['origin_price'] = response.meta['origin_price']
                request.meta['origin_brand_extracted'] = response.meta['origin_brand_extracted']
                if 'threshold' in response.meta:
                    request.meta['threshold'] = response.meta['threshold']

                # if 'origin_id' in response.meta:
                #     request.meta['origin_id'] = response.meta['origin_id']
                #     assert self.by_id
                # else:
                #     assert not self.by_id

                # used for result product URLs
                if 'search_results' in response.meta:
                    request.meta['search_results'] = response.meta['search_results']

                return request

            # if there are no more pending requests, use the accumulated items to find the best match and send it as a result
            else:
                # print "DONE FOR ", response.meta['origin_url'], response.meta['origin_name']

                best_match = None
                if items:
                    # from all results, select the product whose name is most similar to the original product's name.
                    # if a specific threshold was set in the request, use that; otherwise, use the class variable
                    if 'threshold' in response.meta:
                        threshold = response.meta['threshold']
                    else:
                        threshold = self.threshold

                    if 'origin_price' in response.meta:
                        product_price = response.meta['origin_price']
                        ## print "PRICE:", product_price
                    else:
                        product_price = None
                        ## print "NO PRICE"

                    best_match = ProcessText.similar(response.meta['origin_name'],
                                                     response.meta['origin_model'],
                                                     product_price, items, threshold)

                    # #self.log("ALL MATCHES: ", level=log.WARNING)
                    # for item in items:
                    #     ## print item['product_name'].encode("utf-8")
                    #     ## print '\n'

                self.log("FINAL: " + str(best_match), level=log.WARNING)
                self.log("\n----------------------------------------------\n", level=log.WARNING)

                if not best_match:
                    # if there are no results but the option was to include the original product URL,
                    # create an item with just that.
                    # output the item if a match was not found, for either output type
                    #if self.output == 2:
                    item = SearchItem()
                    #item['origin_site'] = site
                    item['origin_url'] = response.meta['origin_url']
                    item['origin_name'] = response.meta['origin_name']
                    if 'origin_model' in response.meta:
                        item['origin_model'] = response.meta['origin_model']

                    # if 'origin_id' in response.meta:
                    #     item['origin_id'] = response.meta['origin_id']
                    #     assert self.by_id
                    # else:
                    #     assert not self.by_id

                    return [item]

                return best_match

        # output an item with just the origin fields if there were no pending requests (no match found)
        else:
            item = SearchItem()
            #item['origin_site'] = site
            # print "DONE FOR ", response.meta['origin_name']
            item['origin_url'] = response.meta['origin_url']
            item['origin_name'] = response.meta['origin_name']

            # if 'origin_id' in response.meta:
            #     item['origin_id'] = response.meta['origin_id']
            #     assert self.by_id
            # else:
            #     assert not self.by_id

            #TODO: uncomment below - it should not have been in the if/else branch!
            self.log("FINAL: " + str(item), level=log.WARNING)
            self.log("\n----------------------------------------------\n", level=log.WARNING)
            return [item]
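# reduceResults defers the actual matching to ProcessText.similar(). As a toy
# stand-in for intuition only, here is a token-overlap scorer with the same
# select-above-threshold shape; the real function also weighs the model number
# and price, which this sketch skips. Names and threshold below are made up.
def toy_best_match(origin_name, candidate_names, threshold=0.5):
    origin_tokens = set(origin_name.lower().split())
    best_score, best = 0.0, None
    for name in candidate_names:
        tokens = set(name.lower().split())
        # Jaccard similarity between the two token sets
        score = float(len(origin_tokens & tokens)) / len(origin_tokens | tokens)
        if score > best_score and score >= threshold:
            best_score, best = score, name
    return best

print(toy_best_match("Acme Blender 5000",
                     ["Acme Blender 5000 Black", "Other Mixer"]))
# -> "Acme Blender 5000 Black" (score 0.75, above the 0.5 threshold)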