def parse(self, response):
    page_1_str = self.page_str + "1"
    this_url = trim_url(response.url, page_1_str)
    print(f"inside parse for {this_url}")
    self.scrape_urls(response)
    # Only scrape pages that have the page_str in the url.
    if this_url.find(self.page_str) != -1:
        print(f"scraping for {this_url}")
        items = response.css('product-item-v2')
        print(f"length of items - {len(items)}")
        metadata = get_url_metadata(self.cursor, this_url)
        section = metadata[1]
        subsection = metadata[2]
        for item in items:
            name = item.css('.product-title ::text').get()
            price_strings = item.css('.product-price ::text').getall()
            price = clean_string(price_strings[-1], ['$'])
            ppu = item.css('.product-price-qty ::text').get()
            unit = self.collect_units(name)
            #inspect_response(response,self)
            if unit == "OZ" or unit == "LB":
                ounces = self.collect_ounces(name)
            else:
                ounces = 0
            print(f"yielding - {name}, {price}, {ppu}, {ounces}, {unit}")
            yield {
                "name": name,
                "price": price,
                "ounces": ounces,
                "unit": unit,
                "price-per-unit": ppu,
                "url": this_url,
                "section": section,
                "subsection": subsection
            }
    # The website redirects us to the url plus page_1_str, which isn't stored in
    # our database, so we trim that off to match the url we do have stored.
    finish_url(self.conn, self.store_id, this_url)
    print("finishing url - " + this_url)
    next_url = get_next_url(self.cursor, 1)
    if next_url is None:
        print("Next url is none, therefore we must be finished!")
        return
    else:
        next_request = create_parse_request(
            next_url,
            self.check_location,
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#openFulfillmentModalButton')))
        print(f"got next_url - {next_url}")
        yield next_request
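# Hedged sketch (assumption, not shown in the source): the parse() above relies on
# a trim_url helper to strip the redirect-added "page=1" suffix so the url matches
# what is stored in the database. A minimal version consistent with that usage:
def trim_url(url, suffix):
    # Drop a trailing suffix such as "?page=1" / "&page=1" if present;
    # otherwise return the url unchanged.
    if suffix and url.endswith(suffix):
        return url[:-len(suffix)]
    return url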
def parse(self, response):
    time.sleep(1)
    url = response.url
    print(f"inside parse for {url}")
    PRODUCTS_CSS = '#product-main'
    metadata = get_url_metadata(self.cursor, url)
    if metadata is None:
        print("Could not find metadata for url - " + url + " - skipping")
        finish_url(self.conn, self.store_id, url)
        return
    section = metadata[1]
    subsection = metadata[2]
    products = response.css(PRODUCTS_CSS)
    for product in products:
        name = product.css('.product-name ::text').get()
        name = name.replace("'", "")
        raw_price = product.css('.product-price ::text').get()
        # Filter out the $'s and any other text around the price.
        price = re.findall("[0-9]+\.[0-9]*", raw_price)[0]
        quantity = product.css('.product-quantity ::text').get()
        index_split = quantity.find('|')
        ppu = quantity[index_split + 1:]
        amount = quantity[:index_split]
        ounces = self.collect_ounces(amount)
        unit = self.collect_unit(amount)
        print(f"yielding - {name}, {price}, {ppu}, {ounces}, {unit}")
        yield {
            "name": name,
            "price": price,
            "ounces": ounces,
            "unit": unit,
            "price-per-unit": ppu,
            "url": url,
            "section": section,
            "subsection": subsection
        }
    check_subsection_amount(self.cursor, url)
    finish_url(self.conn, self.store_id, url)
def parse(self, response):
    self.driver = response.request.meta['driver']
    close_modal(self)
    change_store_location(self)
    url = response.url
    metadata = get_url_metadata(self.cursor, url)
    section = metadata[1]
    subsection = metadata[2]
    #check if it has a next button,
    items = response.css('.cell-content-wrapper')
    for item in items:
        name = item.css('.cell-title-text ::text').get()
        name = clean_string(name, ['\"'])
        price = item.css('[data-test="amount"] .css-19m8h51 ::text').get()
        price = convert_dollars(price)
        quantity = item.css('[data-test="amount"] .css-cpy6p ::text').get()
        unit = item.css('.cell-product-size ::text').get()
        ounces = convert_to_ounces(unit)
        ppu = item.css('[data-test="per-unit-price"] ::text').get()
        ppu = convert_ppu(ppu)
        self.logger.info(
            f"name - {name}, price - {price}, quantity - {quantity}, "
            f"ounces - {ounces}, ppu - {ppu}, url - {url}, "
            f"section - {section}, subsection - {subsection}")
        #inspect_response(response,self)
        yield {
            "name": name,
            "price": price,
            "ounces": ounces,
            "unit": unit,
            "price-per-unit": ppu,
            "url": url,
            "section": section,
            "subsection": subsection
        }
    finish_url(self.conn, self.store_id, url)
    request = self.get_next_request()
    yield request
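# Hedged sketch (assumption): convert_to_ounces is used above to turn the raw
# product-size text (e.g. "12 oz", "1.5 lb") into a number of ounces. Its real
# implementation isn't shown; a minimal standalone version consistent with that
# usage might look like this.
import re

def convert_to_ounces(size_text):
    # Unparseable or missing sizes fall back to 0, matching how the spiders
    # treat unknown quantities elsewhere.
    if not size_text:
        return 0
    match = re.search(r"([\d.]+)\s*(oz|ounce|lb|pound)", size_text, re.IGNORECASE)
    if match is None:
        return 0
    value = float(match.group(1))
    unit = match.group(2).lower()
    # 16 ounces per pound; ounce values pass through unchanged.
    return value * 16 if unit in ("lb", "pound") else value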
def handle_pagination(self, response):
    self.logger.info('inside handle_pagination')
    url = self.driver.current_url
    next_button = self.driver.find_elements_by_css_selector(
        self.NEXT_BUTTON_SELECTOR)
    # inspect_response(response,self)
    if len(next_button) != 0:
        next_page_url = get_next_pagination(self.PAGE_STRING, url)
        metadata = get_url_metadata(self.cursor, url)
        category = metadata[0]
        section = metadata[1]
        subsection = metadata[2]
        quantity = self.driver.find_element_by_css_selector(
            '.Title__browseTotalCount___OWylh').get_attribute('innerText')
        quantity = re.findall('[0-9]+', quantity)[0]
        store_url(self.conn, next_page_url, self.store_id, category,
                  section, subsection, grocery_quantity=quantity)
    finish_url(self.conn, self.store_id, url, scrape_urls=True)
    next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                            scrape_urls=True, filter="aisle=")
    if next_url is None:
        self.logger.debug("Next_url is None, therefore we must be finished!")
        return
    request = create_parse_request(next_url,
                                   self.handle_pagination,
                                   EC.element_to_be_clickable(
                                       (By.CSS_SELECTOR, self.PAGE_LOAD)),
                                   errback=self.retry,
                                   meta_url=next_url,
                                   cookies=False)
    yield request
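# Hedged sketch (assumption): get_next_pagination(page_string, url) isn't shown
# here. The inline pagination logic in one of the other spiders below (the one
# that builds "?page=" urls by hand) suggests it appends "<page_string>2" when
# the url has no page component yet, and otherwise increments the page number.
def get_next_pagination(page_string, url):
    i = url.find(page_string)
    if i == -1:
        # No page component yet, so the next page to fetch is page 2.
        return url + page_string + "2"
    page_start = i + len(page_string)
    current_page = int(url[page_start:])
    return url[:page_start] + str(current_page + 1)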
def handle_pagination(self, response):
    # If it has a page-last class, read that content and interpolate;
    # else, get the last pager page and interpolate.
    self.logger.info("Inside handle_pagination")
    close_modal(self)
    change_store_location(self)
    base_url = response.url
    string_location = base_url.find(self.page_string)
    if string_location != -1:
        base_url = base_url[:string_location]
    pag_last = self.driver.find_elements_by_css_selector(
        '.pagination-last.pager-item')
    if pag_last:
        final_page_number = pag_last[0].text
    else:
        last_page = self.driver.find_elements_by_css_selector(
            '.pagination-page.pager-item')[-1]
        final_page_number = last_page.text
    final_page_number = int(final_page_number)
    metadata = get_url_metadata(self.cursor, base_url)
    category = metadata[0]
    section = metadata[1]
    subsection = metadata[2]
    for page_num in range(1, final_page_number + 1):
        # Something like:
        # https://shop.wegmans.com/shop/categories/94?page=13
        page_url = base_url + self.page_string + str(page_num)
        store_url(self.conn, page_url, self.store_id, category, section,
                  subsection)
    self.logger.info(f"finished handling pagination for {base_url}")
    finish_url(self.conn, self.store_id, response.url, scrape_urls=True)
    request = self.get_next_request()
    yield request
def parse(self, response): url = response.url self.logger.info(f"Inside parse for {url}") GROCERY_SELECTOR = '[data-automation-id="productTile"]' SPONSORED_SELECTOR = '[data-automation-id="sponsoredProductTile"]' GROCERIES_SELECTOR = GROCERY_SELECTOR + ',' + SPONSORED_SELECTOR metadata=get_url_metadata(self.cursor,url) section=metadata[1] subsection=metadata[2] for grocery in response.css(GROCERIES_SELECTOR): NAME_SELECTOR = '[data-automation-id="name"] ::attr(name)' name = grocery.css(NAME_SELECTOR).extract_first() #parse the ounces off of the name decimal_regex = "([\d]+[.]?[\d]*|[.\d]+)" ounces = re.findall(decimal_regex + "\s*o(?:z|unces?)", name, re.IGNORECASE) pounds = re.findall(decimal_regex + "\s*(?:pound|lb)s?", name, re.IGNORECASE) count = re.findall("([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))", name, re.IGNORECASE) self.ounce = ounces self.pounds = pounds self.count = count #Check if the arrays returned from re.findall are empty if ounces: ounces = parse_float(ounces[0]) else: ounces = 0 if pounds: pounds = parse_float(pounds[0]) else: pounds = 0 if count: count = parse_float(count[0]) else: count = 0 if pounds != 0: ounces = 16*pounds elif count != 0: ounces *= count # inspect_response(response,self) SALEPRICE_SELECTOR = '[data-automation-id="salePrice"] ::text' PRICE_SELECTOR = '[data-automation-id="price"] ::text' PRICE_PER_UNIT_SELECTOR = '[data-automation-id="price-per-unit"] ::text' name=grocery.css(NAME_SELECTOR).extract_first() name=clean_string(name,"\"") ounces=ounces pounds=pounds count=count price=str(handle_none(grocery.css(SALEPRICE_SELECTOR).extract_first())).replace('$','') ppu=convert_ppu(grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first()) yield { 'name': name, 'ounces': ounces, 'pounds': pounds, 'count': count, 'price': price, 'price-per-unit': ppu, 'section': section, 'subsection': subsection, 'url': url, } finish_url(self.conn,self.store_id,url) next_url=get_next_url(self.cursor,1,store_id=self.store_id,filter="aisle=") print(f"next_url - {next_url}") if next_url is None: print ("No more urls - finishing") else: request = create_parse_request(next_url, self.parse, EC.element_to_be_clickable( (By.CSS_SELECTOR, '[aria-current="page"]')), meta_url=next_url) yield request
def parse(self, response): page_1_str = self.page_str + "1" meta_url = response.meta.get('url') this_url = response.url #trim_url(response.url,page_1_str) trimmed_url = trim_url(response.url, page_1_str) self.logger.info( f"inside parse for meta_url: {meta_url}, response.url: {response.url}" ) # Only scrape pages that have the page_str in the url. if this_url.find(self.page_str) == -1: self.logger.info( f"Skipping {this_url} because it couldn't find {self.page_str}" ) elif meta_url != response.url: self.logger.info( f"meta_url: {meta_url} != response.url: {response.url}, and so we are finishing stale {meta_url}" ) this_url = meta_url else: self.logger.info(f"scraping for {this_url}") items = response.css('product-item-v2') self.logger.info(f"length of items - {len(items)}") ## FIXME For some reason the middleware is returning an empty response for all of the urls that reach here. #inspect_response(response,self) metadata = get_url_metadata(self.cursor, trimmed_url) if len(metadata) != 3: self.logger.info( f"Couldn't detect metadata: {metadata}, for trimmed_url: {trimmed_url}, defaulting to empty" ) section = "" subsection = "" else: section = metadata[1] subsection = metadata[2] for item in items: name = item.css('.product-title ::text').get() price_string = item.css('.product-price').get() price = re.findall("\$([0-9]+\.[0-9]+)", price_string)[0] ppu = item.css('.product-price-qty ::text').get() unit = self.collect_units(name) if unit == "OZ" or unit == "LB": ounces = self.collect_ounces(name) else: ounces = 0 self.logger.info( f"yielding - {name}, {price}, {ppu}, {ounces}, {unit}") yield { "name": name, "price": price, "ounces": ounces, "unit": unit, "price-per-unit": ppu, "url": this_url, "section": section, "subsection": subsection } #Basically the website redirects us to the url and page_1_str, which isn't added to our database # So we trim that off so we can get the url in our database finish_url(self.conn, self.store_id, this_url) self.logger.info("finishing url - " + this_url + ", store_id: ", self.store_id) # We only want requests that have the page= string in it because they have the groceries, # Also currently we're getting some urls in our database for locations that don't match our default_store_number # So filter those out too. next_url = get_next_url(self.cursor, 1, self.store_id, filter=f"{self.store_number}%page=") if next_url is None: self.logger.info( "Next url is none therefore we must be finished ! ") return else: next_request = create_unfiltered_parse_request( next_url, self.parse, EC.element_to_be_clickable( (By.CSS_SELECTOR, 'product-item-v2'))) self.logger.info(f"got next_url - {next_url}") yield next_request
def parse(self, response):
    url = response.url
    finish_url(self.conn, self.store_id, url)
    items = response.css('.cell-content-wrapper')
    metadata = get_url_metadata(self.cursor, url)
    section = metadata[1]
    subsection = metadata[2]
    # Check if it has a next button.
    next_page = response.css('.pagination-next:not(.disabled)').get()
    if next_page is not None:
        #inspect_response(response,self)
        page_string = "?page="
        page_str_len = len(page_string)
        i = url.find(page_string)
        if i == -1:
            # The url has no page part yet, so add ?page=2 to it.
            next_url = url + page_string + "2"
        else:
            # The url already has a page part, so extract the page number and add 1.
            page_number = i + page_str_len
            current_page = int(url[page_number:])
            next_page = current_page + 1
            next_url = url[:page_number] + str(next_page)
        # Then add it to the urls table.
        store_url(self.conn, next_url, self.store_id,
                  lookup_category("", section, subsection), section, subsection)
    for item in items:
        name = item.css('.cell-title-text ::text').get()
        name = clean_string(name, ['\"'])
        price = item.css('[data-test="amount"] .css-19m8h51 ::text').get()
        price = convert_dollars(price)
        quantity = item.css('[data-test="amount"] .css-cpy6p ::text').get()
        unit = item.css('.cell-product-size ::text').get()
        ounces = convert_to_ounces(unit)
        ppu = item.css('[data-test="per-unit-price"] ::text').get()
        ppu = convert_ppu(ppu)
        print(f"name - {name}, price - {price}, quantity - {quantity}, "
              f"ounces - {ounces}, ppu - {ppu}, url - {url}, "
              f"section - {section}, subsection - {subsection}")
        #inspect_response(response,self)
        yield {
            "name": name,
            "price": price,
            "ounces": ounces,
            "unit": unit,
            "price-per-unit": ppu,
            "url": url,
            "section": section,
            "subsection": subsection
        }
    next_url = get_next_url(self.cursor, 1)
    if next_url is None:
        print("No more URLs to parse. Finishing")
        return
    request = self.create_parse_request(
        next_url, self.parse,
        EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))
    try:
        yield request
    except Exception:
        print(f"Parse - Errored out processing request for - {next_url}")
        next_url = get_next_url(self.cursor, 2)
        print(f"Parse - Now handling {next_url}")
        request = self.create_parse_request(
            next_url, self.parse,
            EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))
        yield SeleniumRequest(
            url=next_url,
            callback=self.parse,
            wait_time=50,
            wait_until=EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '.button.full.cart.add')))
def parse(self, response): # This callback determines if the selected menu is # at the top of the list, if it is then it adds the urls # to the list and keeps going # if its not, then it calls the lua to prepare the page # for scraping, and then scrapes it url = response.url menu = response.css(".category-filter__link") #submenu = response.css("") #self.logger.info ("self.urls - " +str(self.urls)) self.logger.info("processing response.url - " + response.url) #self.logger.info ("menu: ") #self.logger.info (menu.getall()) #self.logger.info ("len(menu): " + str(len(menu))) #self.logger.info ("menu[0] : " + menu.get()) #self.logger.info("name - " + menu[0].css('.category-filter__text ::text').get()) #inspect_response(response,self) if not (len(menu) > 0 and menu[0].css('[aria-current="page"]')): #we are on a subpage, so now we can start scraping # TODO check to see if we should just scrape all pages? GROCERY_SELECTOR = '.grid-item' NAME_SELECTOR = '.small-type.detail-card-description ::text' PRICE_SELECTOR = '.price ::text' PRICE_PER_UNIT_SELECTOR = '.sub-headline.detail-card-subtext ::text' metadata = get_url_metadata(self.cursor, url) if metadata is None: self.logger.debug(f"Metadata is none for {url}") metadata = ["", ""] section = metadata[0] subsection = metadata[1] self.logger.info("subpage - scraping " + url + ", from section - " + section) for grocery in response.css(GROCERY_SELECTOR): self.name = grocery.css(NAME_SELECTOR).extract_first() self.price = grocery.css(PRICE_SELECTOR).extract_first() if self.price is not None: self.price = self.price.replace('*', '').replace('$', '') self.ppu = grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first() if self.ppu is not None: self.ppu = convert_ppu(self.ppu) #inspect_response(response, self) #parse the ounces off of the name yield { 'name': self.name, 'price': self.price, 'price-per-unit': self.ppu, 'section': section, 'subsection': subsection, 'url': response.url } finish_url(self.conn, self.store_id, url) self.logger.info("finishing url - " + url) next_url = get_next_url(self.cursor, 1, store_id=self.store_id) if next_url is not None: self.logger.info("got next_url - " + next_url) yield SplashRequest( next_url, self.parse, endpoint='execute', dont_filter=True, args={'lua_source': self.expand_and_scroll_lua}) else: self.logger.info( "Next url is none therefore we must be finished ! ")
def parse(self, response): GROCERY_SELECTOR = '[data-automation-id="productTile"]' SPONSORED_SELECTOR = '[data-automation-id="sponsoredProductTile"]' GROCERIES_SELECTOR = GROCERY_SELECTOR + ',' + SPONSORED_SELECTOR NEXT_BUTTON = '[data-automation-id="nextButton"]' # Handle pagination url = response.url print (f"working on url - {url}") metadata=get_url_metadata(self.cursor,url) section=metadata[1] subsection=metadata[2] next_page=response.css(NEXT_BUTTON).get() if next_page is not None: #inspect_response(response,self) page_string="&page=" page_str_len=len(page_string) next_page_url=get_next_pagination(page_string,url) store_url(self.conn,next_page_url, self.store_id, lookup_category("",section,subsection) ,section, subsection) for grocery in response.css(GROCERIES_SELECTOR): NAME_SELECTOR = '[data-automation-id="name"] ::attr(name)' self.name = grocery.css(NAME_SELECTOR).extract_first() #parse the ounces off of the name decimal_regex = "([\d]+[.]?[\d]*|[.\d]+)" self.ounces = re.findall(decimal_regex + "\s*o(?:z|unces?)", self.name, re.IGNORECASE) self.pounds = re.findall(decimal_regex + "\s*(?:pound|lb)s?", self.name, re.IGNORECASE) self.count = re.findall("([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))", self.name, re.IGNORECASE) #Check if the arrays returned from re.findall are empty if self.ounces: self.ounces = parse_float(self.ounces[0]) else: self.ounces = 0 if self.pounds: self.pounds = parse_float(self.pounds[0]) else: self.pounds = 0 if self.count: self.count = parse_float(self.count[0]) else: self.count = 0 if self.pounds != 0: self.ounces = 16*self.pounds elif self.count != 0: self.ounces *= self.count # inspect_response(response,self) SALEPRICE_SELECTOR = '[data-automation-id="salePrice"] ::text' PRICE_SELECTOR = '[data-automation-id="price"] ::text' PRICE_PER_UNIT_SELECTOR = '[data-automation-id="price-per-unit"] ::text' name=grocery.css(NAME_SELECTOR).extract_first() name=clean_string(name,"\"") ounces=self.ounces pounds=self.pounds count=self.count price=str(handle_none(grocery.css(SALEPRICE_SELECTOR).extract_first())).replace('$','') ppu=convert_ppu(grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first()) url=response.url yield { 'name': name, 'ounces': ounces, 'pounds': pounds, 'count': count, 'price': price, 'price-per-unit': ppu, 'section': section, 'subsection': subsection, 'url': url, } finish_url(self.conn,self.store_id,url) next_url=get_next_url(self.cursor,1) print(f"next_url - {next_url}") if next_url is None: print ("No more urls - finishing") else: yield SplashRequest(next_url, self.parse, endpoint='render.html', args={ 'wait': 10, 'section': section, 'subsection': subsection })
def parse(self, response): # This callback determines if the selected menu is # at the top of the list, if it is then it adds the urls # to the list and keeps going # if its not, then it calls the lua to prepare the page # for scraping, and then scrapes it url = response.url menu = response.css(".category-filter__link") #submenu = response.css("") #print ("self.urls - " +str(self.urls)) print("processing response.url - " + response.url) #print ("menu: ") #print (menu.getall()) #print ("len(menu): " + str(len(menu))) #print ("menu[0] : " + menu.get()) #print("name - " + menu[0].css('.category-filter__text ::text').get()) #inspect_response(response,self) if (len(menu) > 0 and menu[0].css('[aria-current="page"]')): print(f"inside menu page for url - {url}") # The top page is active #print ("menu[0] : [aria-current=page] " + menu[0].css('[aria-current="page"]').get()) # therefore we need to scrape the links, and continue searching # we then need to loop through each other page. # call parse, and scrape it is not menu_url = menu[0].css('::attr(href)').get() menu_name = menu[0].css('.category-filter__text ::text').get() for item in menu: heading = item.css('.category-filter__text ::text').get() scraped_url = item.css('::attr(href)').get() scraped_url = self.base_url + scraped_url section = menu_name subsection = heading category = lookup_category("", section, subsection) store_url(self.conn, scraped_url, self.store_id, category, section, subsection) #self.section_dict[url]=(menu_name, heading) #if self.urls.count(url) == 0: # self.urls.append(url) #urls=menu.css('::attr(href)').getall() # Remove the the first(this) page from list to parse #urls.pop() #self.urls.extend(urls) #print("urls to scrape - " + str(self.urls)) #print("local urls - " + str(urls)) """ while len(self.urls) != 0: url = self.urls.pop() self.processedUrls.append(url) #url = self.base_url + url_suffix #print ("urls - " + str(self.urls)) #print ("pulling from url - " + url) #print ("urls lengths - " + str(len(self.urls))) yield SplashRequest(url, self.parse, endpoint='execute', args={'lua_source': self.expand_and_scroll_lua}) """ elif (len(menu) == 0): inspect_response(response, self) else: #we are on a subpage, so now we can start scraping # GROCERY_SELECTOR = '.grid-item' NAME_SELECTOR = '.small-type.detail-card-description ::text' PRICE_SELECTOR = '.price ::text' PRICE_PER_UNIT_SELECTOR = '.sub-headline.detail-card-subtext ::text' metadata = get_url_metadata(self.cursor, url) section = metadata[0] subsection = metadata[1] print("subpage - scraping " + url + ", from section - " + section) for grocery in response.css(GROCERY_SELECTOR): self.name = grocery.css(NAME_SELECTOR).extract_first() self.price = grocery.css(PRICE_SELECTOR).extract_first() if self.price is not None: self.price = self.price.replace('*', '').replace('$', '') self.ppu = grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first() if self.ppu is not None: self.ppu = convert_ppu(self.ppu) #inspect_response(response, self) #parse the ounces off of the name yield { 'name': self.name, 'price': self.price, 'price-per-unit': self.ppu, 'section': section, 'subsection': subsection, 'url': response.url } finish_url(self.conn, self.store_id, url) print("finishing url - " + url) next_url = get_next_url(self.cursor, 1) if next_url is not None: print("got next_url - " + next_url) yield SplashRequest( next_url, self.parse, endpoint='execute', dont_filter=True, args={'lua_source': self.expand_and_scroll_lua}) else: print("Next url is none therefore we must be finished ! ")