def start_requests(self):
    url = self.start_urls[0]
    request = create_unfiltered_parse_request(
        url,
        self.collect_menu,
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '[id="catalog-nav-main-shop.categories"]')),
        meta_url=url)
    if is_url_scraped(self.cursor, url, self.store_id, scrape_urls=True):
        #next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
        #                        scrape_urls=True, filter=self.page_string, reverse_filter=True)
        next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                                scrape_urls=True)
        request = create_unfiltered_parse_request(
            next_url,
            self.handle_first_request,
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#shopping-selector-parent-process-modal-close-click')),
            meta_url=next_url,
            errback=self.handle_pagination)
    store_url(self.conn, url, self.store_id, "Start", "", "")
    yield request
def collect_menu(self, response):
    self.logger.info("inside collect_menu!")
    self.driver = response.request.meta['driver']
    close_modal(self)
    change_store_location(self)
    departments = self.driver.find_elements_by_css_selector(
        '[category-filter="subcategory"]')
    for department in departments:
        dept_name = department.find_element_by_css_selector(
            '[data-test="category-card-"]').text
        aisles = department.find_elements_by_css_selector('a')
        self.logger.info(f"dept_name: {dept_name}")
        self.aisles = aisles
        for aisle in aisles:
            aisle_name = aisle.text
            aisle_url = aisle.get_attribute("href")
            category = lookup_category("", dept_name, aisle_name)
            store_url(self.conn, aisle_url, self.store_id, category,
                      dept_name, aisle_name)
    #inspect_response(response, self)
    self.logger.info("finished collect_menu!")
    finish_url(self.conn, self.store_id, response.url, scrape_urls=True)
    request = self.get_next_request()
    yield request
def scrape_urls(self, response):
    # 1. Sort through data and extract urls
    # 2. Put urls together
    # 3. Loop to each url, returning @parse
    base_url = "https://www.walmart.com"
    self.raw = response.body_as_unicode()
    #print("raw: " + self.raw)
    remove = ['{', '}', 'Link', ' ']
    self.cleaned = self.raw
    for char in remove:
        self.cleaned = self.cleaned.replace(char, '')
    self.comma_split = self.cleaned.split('","')
    #print("cleaned - " + cleaned)
    #print("comma_split - ")
    #print(*comma_split)
    self.colon_split = [entry.split('":"') for entry in self.comma_split]
    #inspect_response(response, self)
    self.colon_split[0].remove('"sections')
    #print("colon_split - ")
    #print(*colon_split)
    self.urls = [entry[-1] for entry in self.colon_split]
    #print("urls - ")
    #print(self.urls)
    section = "unset"
    subsection = "unset"
    self.section_dict = {}
    chars_to_remove = ["\'", "&"]
    for entry in self.colon_split:
        # Each entry has a subheading (normally at index 0 unless it also carries a heading entry)
        section = clean_string(entry[0], chars_to_remove)
        url_end = clean_string(entry[-1], "\"")
        # If it is a section header it will contain 3 entries,
        # and all subsequent entries will share the same heading
        if len(entry) > 2:
            section = clean_string(entry[0], chars_to_remove)
            subsection = clean_string(entry[1], chars_to_remove)
        url = base_url + url_end
        category = lookup_category("", section, subsection)
        store_url(self.conn, url, self.store_id, category, section, subsection)
        #self.section_dict[url] = (self.section, self.subsection)
        #print(section, subsection, url)
    next_url = get_next_url(self.cursor, 1)
    if next_url is None:
        print("No more urls to parse. Finishing")
    else:
        yield SplashRequest(next_url, self.parse,
                            endpoint='render.html',
                            args={
                                'wait': 10,
                                'section': section,
                                'subsection': subsection
                            })
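# Note: the string-splitting above assumes the response body is a flat JSON-like
# mapping of section/subsection names to "Link" URLs. A minimal sketch of the same
# extraction using json.loads instead of manual splitting is shown below; the
# "sections" key and the payload shape are assumptions inferred from the cleanup
# code, and extract_section_links is a hypothetical helper, not part of the spider.
import json

def extract_section_links(raw_body, base_url="https://www.walmart.com"):
    """Hypothetical helper: return (section, absolute_url) pairs from a JSON payload."""
    data = json.loads(raw_body)
    links = []
    for section, entry in data.get("sections", {}).items():
        # Each entry is assumed to carry a relative link, either directly or under "Link".
        link = entry.get("Link") if isinstance(entry, dict) else entry
        if link:
            links.append((section, base_url + link))
    return links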
def parse(self, response):
    # This callback determines if the selected menu is
    # at the top of the list; if it is, it adds the urls
    # to the list and keeps going.
    # If it is not, it calls the lua to prepare the page
    # for scraping, and then scrapes it.
    url = response.url
    menu = response.css(".category-filter__link")
    #submenu = response.css("")
    #print("self.urls - " + str(self.urls))
    print("processing response.url - " + response.url)
    #print("menu: ")
    #print(menu.getall())
    #print("len(menu): " + str(len(menu)))
    #print("menu[0] : " + menu.get())
    #print("name - " + menu[0].css('.category-filter__text ::text').get())
    #inspect_response(response, self)
    if (len(menu) > 0 and menu[0].css('[aria-current="page"]')):
        print(f"inside menu page for url - {url}")
        # The top page is active,
        #print("menu[0] : [aria-current=page] " + menu[0].css('[aria-current="page"]').get())
        # therefore we need to scrape the links and continue searching.
        # We then need to loop through each other page,
        # call parse, and scrape it if it is not.
        menu_url = menu[0].css('::attr(href)').get()
        menu_name = menu[0].css('.category-filter__text ::text').get()
        for item in menu:
            heading = item.css('.category-filter__text ::text').get()
            scraped_url = item.css('::attr(href)').get()
            scraped_url = self.base_url + scraped_url
            section = menu_name
            subsection = heading
            category = lookup_category("", section, subsection)
            store_url(self.conn, scraped_url, self.store_id, category,
                      section, subsection)
    elif (len(menu) == 0):
        inspect_response(response, self)
    finish_url(self.conn, self.store_id, url, True)
    print("finishing url - " + url)
    next_url = get_next_url(self.cursor, 1, self.store_id, True)
    if next_url is not None:
        print("got next_url - " + next_url)
        yield SplashRequest(next_url,
                            self.parse,
                            endpoint='execute',
                            dont_filter=True,
                            args={'lua_source': self.expand_and_scroll_lua})
    else:
        print("Next url is none therefore we must be finished!")
def start_requests(self):
    url = self.start_urls[0]
    start_request = create_nocookies_request(
        url,
        self.handle_onboard,
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '[data-automation-id="onboardingModalCloseBtn"]')),
        meta_url=url)
    store_url(self.conn, url, self.store_id, "Start", "", "")
    yield start_request
def parse_urls(self, response):
    location = response.css('[data-test="store-button"] ::text').get()
    self.driver = response.request.meta['driver']
    location = self.driver.find_element_by_css_selector(
        '[data-test="store-button"]').text
    print(f"detected location - {location}")
    if location != self.location:
        self.change_store_location(response)
    self.section_group = response.css(".subcategory.category")
    section_group = response.css(".subcategory.category")
    for section in section_group:
        section_name = section.css(".css-1pita2n ::text").get()
        url_nodes = section.css("ul.children a")
        for url_node in url_nodes:
            subsection_name = url_node.css("::text").get()
            url = self.base_url + url_node.css("::attr(href)").get()
            store_url(self.conn, url, self.store_id,
                      lookup_category("", section_name, subsection_name),
                      section_name, subsection_name)
    finish_url(self.conn, self.store_id, response.url)
    function = self.parse
    item_to_find = '[add-to-cart]'
    if len(self.start_urls) != 0:
        next_url = self.start_urls.pop()
        store_url(self.conn, next_url, self.store_id, "", "", "")
        function = self.parse_urls
        item_to_find = '[data-test="store-button"]'
        #request = self.create_parse_request(next_url, self.parse_urls,
        #    EC.element_to_be_clickable((By.CSS_SELECTOR, '[data-test="store-button"]')))
    else:
        next_url = get_next_url(self.cursor, 1)
        #request = self.create_parse_request(next_url, self.parse,
        #    EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))
    if next_url is None:
        print("No more URLs to parse. Finishing")
        return
    request = self.create_parse_request(
        next_url, function,
        EC.element_to_be_clickable((By.CSS_SELECTOR, item_to_find)))
    #FIXME these try/except blocks don't actually handle timeout exceptions:
    # the exception is raised later, when the downloader processes the request,
    # not when it is yielded here.
    try:
        yield request
    except Exception:
        print(f"Parse - Errored out processing request for - {next_url}")
        next_url = get_next_url(self.cursor, 2)
        print(f"Parse - Now handling {next_url}")
        request = self.create_parse_request(
            next_url, self.parse,
            EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))
        yield request
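# One way to address the FIXME above (a sketch, not the project's actual handler):
# pass an errback when building the request, so navigation failures and timeouts
# are routed through Scrapy's failure handling instead of the try/except around
# `yield`. create_parse_request is assumed to forward an errback kwarg, as the
# other spiders do; handle_request_failure is a hypothetical callback name.
def handle_request_failure(self, failure):
    """Hypothetical errback sketch: log the failed URL and queue the next one."""
    failed_url = failure.request.url
    self.logger.warning(f"Request failed for {failed_url} ({failure.value!r}); moving on")
    next_url = get_next_url(self.cursor, 2)
    if next_url is not None:
        yield self.create_parse_request(
            next_url, self.parse,
            EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))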
def walk_menu(self, response):
    # inspect_response(response, self)
    self.driver = response.request.meta['driver']
    self.logger.info('Inside walk_menu')
    start_url = self.driver.current_url
    menu_button = self.driver.find_element_by_css_selector(
        '[data-automation-id="NavigationBtn"]')
    menu_button.click()
    time.sleep(.5)
    departments = self.driver.find_elements_by_css_selector(
        '.NavigationPanel__department___1DF7d button')
    for department in departments:
        department_name = department.get_attribute('aria-label')
        department.click()
        time.sleep(.5)
        aisles = self.driver.find_elements_by_css_selector(
            '.NavigationPanel__aisleLink___309i2')
        for aisle in aisles:
            url = aisle.get_attribute('href')
            aisle_name = aisle.get_attribute('innerText')
            # self.department_name = department_name
            # self.aisle_name = aisle_name
            self.logger.info(
                f"department_name: {department_name}, aisle_name: {aisle_name}")
            category = lookup_category("", department_name, aisle_name)
            self.logger.info(f"Storing aisle: {aisle_name}, url: {url}")
            store_url(self.conn, url, self.store_id, category,
                      department_name, aisle_name)
    finish_url(self.conn, self.store_id, start_url, scrape_urls=True)
    next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                            scrape_urls=True, filter="aisle=")
    if next_url is None:
        self.logger.debug("Next_url is None therefore we must be finished!")
        return
    self.next_url = next_url
    pagination_request = create_parse_request(
        next_url,
        self.handle_pagination,
        EC.element_to_be_clickable((By.CSS_SELECTOR, self.PAGE_LOAD)),
        errback=self.retry,
        meta_url=next_url,
        cookies=False)
    yield pagination_request
def walk_through_pages(self, section, subsection):
    #print(f"walk_through_pages for {section},{subsection}")
    category = lookup_category("", section, subsection)
    start_url = self.driver.current_url
    try:
        # From here we should check if we are in a different menu
        next_arrow = self.driver.find_element_by_css_selector('.next-arrow')
    except NoSuchElementException:
        return
    self.handle_click(next_arrow, self.delay)
    current_url = self.driver.current_url
    store_url(self.conn, current_url, self.store_id, category, section,
              subsection, self.get_quantity())
    # Unfortunately we have to recurse until there are no more pages to walk through
    self.walk_through_pages(section, subsection)
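# The recursion above grows the call stack by one frame per result page. A minimal
# iterative sketch of the same walk, using the same helpers and the same assumption
# that '.next-arrow' is absent on the last page (the _iterative name is hypothetical):
def walk_through_pages_iterative(self, section, subsection):
    category = lookup_category("", section, subsection)
    while True:
        try:
            next_arrow = self.driver.find_element_by_css_selector('.next-arrow')
        except NoSuchElementException:
            return  # no next-page arrow: we are on the last page
        self.handle_click(next_arrow, self.delay)
        store_url(self.conn, self.driver.current_url, self.store_id, category,
                  section, subsection, self.get_quantity())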
def start_requests(self):
    self.store_id = find_store_id(self.cursor, self.store_name, self.location)
    if len(self.start_urls) != 0:
        url = self.start_urls.pop()
        store_url(self.conn, url, self.store_id, "", "", "")
        print(f"Starting requests with - {url}")
        wait_until = EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '[data-test="store-button"]'))
        request = self.create_parse_request(
            url, self.parse_urls,
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '[data-test="store-button"]')))
        #request = self.create_parse_request(url, self.parse_urls, wait_until)
        yield request
    else:
        print("start_requests - len(start_urls) == 0 : exiting")
def handle_pagination(self, response):
    self.logger.info('inside handle_pagination')
    url = self.driver.current_url
    next_button = self.driver.find_elements_by_css_selector(
        self.NEXT_BUTTON_SELECTOR)
    # inspect_response(response, self)
    if len(next_button) != 0:
        next_page_url = get_next_pagination(self.PAGE_STRING, url)
        metadata = get_url_metadata(self.cursor, url)
        category = metadata[0]
        section = metadata[1]
        subsection = metadata[2]
        quantity = self.driver.find_element_by_css_selector(
            '.Title__browseTotalCount___OWylh').get_attribute('innerText')
        quantity = re.findall('[0-9]+', quantity)[0]
        store_url(self.conn, next_page_url, self.store_id, category, section,
                  subsection, grocery_quantity=quantity)
    finish_url(self.conn, self.store_id, url, scrape_urls=True)
    next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                            scrape_urls=True, filter="aisle=")
    if next_url is None:
        self.logger.debug("Next_url is None therefore we must be finished!")
        return
    request = create_parse_request(
        next_url,
        self.handle_pagination,
        EC.element_to_be_clickable((By.CSS_SELECTOR, self.PAGE_LOAD)),
        errback=self.retry,
        meta_url=next_url,
        cookies=False)
    yield request
def scrape_urls(self, response):
    mainGroups = response.css('.col-12.col-sm-12.col-md-4.col-lg-4.col-xl-3')
    #TODO can probably infer some categories from location
    for mainGroup in mainGroups:
        view_all = mainGroup.css('.text-uppercase.view-all-subcats ::attr(href)').get()
        view_all_url = self.base_url + view_all
        section = mainGroup.css('.product-title.text-uppercase ::text').get()
        section = section.strip()
        category = lookup_category("", section, "")
        #print(f"view_all_url - {view_all_url}, section - {section}, category - {category}")
        store_url(self.conn, view_all_url, self.store_id, category, section, "")
    siblingAisles = response.css('.siblingAisle')
    for siblingAisle in siblingAisles:
        href = siblingAisle.css('::attr(href)').get()
        siblingAisleUrl = self.base_url + href
        section = response.css('[aria-current="location"] ::text').get()
        section = section.strip()
        subsection = siblingAisle.css('::text').get()
        subsection = subsection.strip()
        category = lookup_category("", section, subsection)
        store_url(self.conn, siblingAisleUrl, self.store_id, category, section, subsection)
    # Check if it has a load-more button and then increment the page number on it
    if response.css('.primary-btn.btn.btn-default.btn-secondary.bloom-load-button').get() is not None:
        path = response.css('[aria-current]:not(.menu-nav__sub-item) ::text').getall()
        #print(f"path - {path} for url - {response.url}")
        section = path[1]
        section = section.strip()
        subsection = path[-2]
        subsection = subsection.strip()
        category = lookup_category("", section, subsection)
        next_page_url = get_next_pagination(self.page_str, response.url)
        print(f'load-more-button. storing - {next_page_url}, section - {section}, '
              f'subsection - {subsection}, category - {category}')
        store_url(self.conn, next_page_url, self.store_id, category, section, subsection)
def crawl_2nd_layer_menu(self, section, subsection):
    section_url = self.driver.current_url
    section_category = lookup_category("", section, subsection)
    #print(f"inside crawl_2nd_layer_menu for {section}:{subsection} with url - {section_url}")
    sections = self.driver.find_elements_by_css_selector('#collapseOne > li > a')
    next_section = self.get_next_2nd_layer_section(section, subsection, sections)
    while next_section is not None:
        current_section = next_section
        section_text = current_section.get_attribute('innerText')
        # The trick for 2nd-layer sections is to append the layer-2 info to the subsection
        subsection_text = subsection + ": " + section_text
        self.handle_click(current_section, self.delay)
        #print(f"subsection_text - {subsection_text}")
        current_url = self.driver.current_url
        category = lookup_category("", section, subsection_text)
        num_groceries = self.get_quantity()
        # We need to handle the pagination here, because we don't revisit this spot
        self.walk_through_pages(section, subsection_text)
        store_url(self.conn, current_url, self.store_id, category, section,
                  subsection_text, num_groceries)
        sections = self.driver.find_elements_by_css_selector('#collapseOne > li > a')
        next_section = self.get_next_2nd_layer_section(section, subsection, sections)
    # Store the section url afterwards so we know we've completed it
    store_url(self.conn, section_url, self.store_id, section_category, section,
              subsection, self.get_quantity())
    # We then need to click on the section header to get back outside the menu and continue on
    section_button = self.driver.find_element_by_css_selector(
        'li.breadcrumb-item:nth-child(2) > span:nth-child(1) > a:nth-child(1)')
    self.handle_click(section_button, self.delay)
def start_requests(self):
    url = self.start_urls[0]
    start_request = create_unfiltered_parse_request(
        url,
        self.handle_onboard,
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '[data-automation-id="onboardingModalCloseBtn"]')),
        errback=self.prompt_blocked,
        meta_url=url,
        cookies=False)
    store_url(self.conn, url, self.store_id, "Start", "", "")
    self.logger.info(f"about to call walk_menu with response.url: {url}")
    # Note: this walk_menu request is constructed here but never yielded;
    # only start_request goes out from this method.
    request = create_unfiltered_parse_request(
        url,
        self.walk_menu,
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '[data-automation-id="NavigationBtn"]')),
        meta_url=url,
        cookies=False)
    yield start_request
def handle_pagination(self, response):
    # If it has a page-last class, read that content and interpolate;
    # else, take the last pager page and interpolate.
    self.logger.info("Inside handle_pagination")
    close_modal(self)
    change_store_location(self)
    base_url = response.url
    string_location = base_url.find(self.page_string)
    if string_location != -1:
        base_url = base_url[:string_location]
    pag_last = self.driver.find_elements_by_css_selector(
        '.pagination-last.pager-item')
    if pag_last:
        final_page_number = pag_last[0].text
    else:
        last_page = self.driver.find_elements_by_css_selector(
            '.pagination-page.pager-item')[-1]
        final_page_number = last_page.text
    final_page_number = int(final_page_number)
    metadata = get_url_metadata(self.cursor, base_url)
    category = metadata[0]
    section = metadata[1]
    subsection = metadata[2]
    for page_num in range(1, final_page_number + 1):
        # Something like:
        # https://shop.wegmans.com/shop/categories/94?page=13
        page_url = base_url + self.page_string + str(page_num)
        store_url(self.conn, page_url, self.store_id, category, section, subsection)
    self.logger.info(f"finished handling pagination for {base_url}")
    finish_url(self.conn, self.store_id, response.url, scrape_urls=True)
    request = self.get_next_request()
    yield request
def crawl_submenu(self, response, section):
    ret = None
    subsections = self.driver.find_elements_by_css_selector('#collapseOne > li > a')
    next_subsection = self.get_next_subsection(section, subsections)
    while next_subsection is not None:
        current_subsection = next_subsection
        subsection_text = current_subsection.get_attribute('innerText')
        self.handle_click(current_subsection, self.delay)
        try:
            # From here we should check if we are in a different menu
            clicked_element = self.driver.find_element_by_css_selector(
                '#collapseOne > li > span')
        except NoSuchElementException:
            clicked_element = None
        if clicked_element is None:
            #print(f"Now entered submenu for {subsection_text}")
            self.crawl_2nd_layer_menu(section, subsection_text)
        else:
            #print(f"Not in submenu for {subsection_text}")
            current_url = self.driver.current_url
            category = lookup_category("", section, subsection_text)
            num_groceries = self.get_quantity()
            self.walk_through_pages(section, subsection_text)
            store_url(self.conn, current_url, self.store_id, category, section,
                      subsection_text, num_groceries)
        #inspect_response(response, self)
        #print(f"subsection_text - {subsection_text}")
        local_subsections = self.driver.find_elements_by_css_selector(
            '#collapseOne > li > a')
        next_subsection = self.get_next_subsection(section, local_subsections)
    return ret
def crawl_menu(self, response):
    self.driver = response.request.meta['driver']
    actions = ActionChains(self.driver)
    print("inside crawl_menu")
    accept_cookies = self.driver.find_element_by_css_selector(
        '[title="Accept Cookies"]')
    self.handle_click(accept_cookies, self.delay)
    menu_button = self.driver.find_element_by_css_selector('.nav-open')
    self.handle_click(menu_button, self.delay)
    # We then need to scrape all of the '.category-link' elements, hover over
    # each one, and scrape the hrefs that appear
    sections = self.driver.find_elements_by_css_selector('.category-link')
    next_section = self.get_next_section(sections)
    self.section_list = sections
    #inspect_response(response, self)
    while next_section is not None:
        actions.move_to_element(next_section)
        section_name = next_section.get_attribute('innerText')
        print(f"using next_section: {section_name}")
        self.handle_click(next_section, self.delay)
        current_url = self.driver.current_url
        category = lookup_category("", section_name, "")
        # While on this page we need to click on all of the subsections
        self.crawl_submenu(response, section_name)
        #inspect_response(response, self)
        num_groceries = self.get_quantity()
        store_url(self.conn, current_url, self.store_id, category,
                  section_name, "", num_groceries)
        # Now we need to reset it and do it again
        self.handle_click(menu_button, self.delay)
        sections = self.driver.find_elements_by_css_selector('.category-link')
        next_section = self.get_next_section(sections)
    return
def scrape_urls(self, response):
    # FIXME the links for the hrefs default to 3132, then change to the correct 2635
    mainGroups = response.css('.col-12.col-sm-12.col-md-4.col-lg-4.col-xl-3')
    section = response.css('[aria-current="location"] ::text').get()
    if section is not None:
        section = section.strip()
    self.logger.info("Inside scrape_urls")
    #TODO can probably infer some categories from location
    for mainGroup in mainGroups:
        #self.logger.info(f"Using mainGroup - {mainGroup}")
        # It might be coming from here? It looks like the main categories are all having issues
        view_all = mainGroup.css('.text-uppercase.view-all-subcats ::attr(href)').get()
        view_all_url = self.base_url + view_all
        view_all_url = self.replace_store_number(view_all_url)
        section = mainGroup.css('.product-title.text-uppercase ::text').get()
        section = section.strip()
        category = lookup_category("", section, "")
        self.logger.info(
            f"view_all_url - {view_all_url}, section - {section}, category - {category}")
        store_url(self.conn, view_all_url, self.store_id, category, section, "")
    aisleCategories = response.css('a.aisle-category')
    for aisleCategory in aisleCategories:
        aisleName = aisleCategory.css('::attr(data-aisle-name)').get().strip()
        aisleHref = aisleCategory.css('::attr(href)').get()
        aisleUrl = self.base_url + aisleHref
        aisleUrl = self.replace_store_number(aisleUrl)
        subsection = aisleName
        category = lookup_category("", section, subsection)
        self.logger.info(
            f"found aisleCategory with section - {section}, subsection - {subsection}")
        store_url(self.conn, aisleUrl, self.store_id, category, section, subsection)
    siblingAisles = response.css('.siblingAisle')
    for siblingAisle in siblingAisles:
        self.logger.info(f"using siblingAisle - {siblingAisle}")
        href = siblingAisle.css('::attr(href)').get()
        siblingAisleUrl = self.base_url + href
        siblingAisleUrl = self.replace_store_number(siblingAisleUrl)
        section = response.css('[aria-current="location"] ::text').get()
        section = section.strip()
        subsection = siblingAisle.css('::text').get()
        subsection = subsection.strip()
        category = lookup_category("", section, subsection)
        self.logger.info(f"siblingAisle storing: {siblingAisleUrl}")
        store_url(self.conn, siblingAisleUrl, self.store_id, category, section, subsection)
    # Check if it has a load-more button and then increment the page number on it
    if response.css('.primary-btn.btn.btn-default.btn-secondary.bloom-load-button').get() is not None:
        path = response.css('[aria-current]:not(.menu-nav__sub-item) ::text').getall()
        #self.logger.info(f"path - {path} for url - {response.url}")
        section = path[1]
        section = section.strip()
        subsection = path[-2]
        subsection = subsection.strip()
        category = lookup_category("", section, subsection)
        next_page_url = get_next_pagination(self.page_str, response.url)
        next_page_url = self.replace_store_number(next_page_url)
        self.logger.info(
            f'load-more-button. storing - {next_page_url}, section - {section}, '
            f'subsection - {subsection}, category - {category}')
        store_url(self.conn, next_page_url, self.store_id, category, section, subsection)
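# replace_store_number is used above to correct hrefs that render with the wrong
# store id (the FIXME about 3132 vs 2635). Its real implementation lives elsewhere
# in the project; the sketch below only illustrates the kind of substitution it
# might perform, assuming the store number appears as a "/store/<digits>/" path
# segment. Both the function name and the URL pattern are assumptions.
import re

def replace_store_number_sketch(url, correct_store_number="2635"):
    """Hypothetical: swap whatever store number is embedded in the URL path."""
    return re.sub(r"/store/\d+/", f"/store/{correct_store_number}/", url)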
def parse(self, response):
    GROCERY_SELECTOR = '[data-automation-id="productTile"]'
    SPONSORED_SELECTOR = '[data-automation-id="sponsoredProductTile"]'
    GROCERIES_SELECTOR = GROCERY_SELECTOR + ',' + SPONSORED_SELECTOR
    NEXT_BUTTON = '[data-automation-id="nextButton"]'
    # Handle pagination
    url = response.url
    print(f"working on url - {url}")
    metadata = get_url_metadata(self.cursor, url)
    section = metadata[1]
    subsection = metadata[2]
    next_page = response.css(NEXT_BUTTON).get()
    if next_page is not None:
        #inspect_response(response, self)
        page_string = "&page="
        page_str_len = len(page_string)
        next_page_url = get_next_pagination(page_string, url)
        store_url(self.conn, next_page_url, self.store_id,
                  lookup_category("", section, subsection), section, subsection)
    for grocery in response.css(GROCERIES_SELECTOR):
        NAME_SELECTOR = '[data-automation-id="name"] ::attr(name)'
        self.name = grocery.css(NAME_SELECTOR).extract_first()
        # Parse the ounces off of the name
        decimal_regex = r"([\d]+[.]?[\d]*|[.\d]+)"
        self.ounces = re.findall(decimal_regex + r"\s*o(?:z|unces?)",
                                 self.name, re.IGNORECASE)
        self.pounds = re.findall(decimal_regex + r"\s*(?:pound|lb)s?",
                                 self.name, re.IGNORECASE)
        self.count = re.findall(r"([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))",
                                self.name, re.IGNORECASE)
        # Check if the arrays returned from re.findall are empty
        if self.ounces:
            self.ounces = parse_float(self.ounces[0])
        else:
            self.ounces = 0
        if self.pounds:
            self.pounds = parse_float(self.pounds[0])
        else:
            self.pounds = 0
        if self.count:
            self.count = parse_float(self.count[0])
        else:
            self.count = 0
        if self.pounds != 0:
            self.ounces = 16 * self.pounds
        elif self.count != 0:
            self.ounces *= self.count
        # inspect_response(response, self)
        SALEPRICE_SELECTOR = '[data-automation-id="salePrice"] ::text'
        PRICE_SELECTOR = '[data-automation-id="price"] ::text'
        PRICE_PER_UNIT_SELECTOR = '[data-automation-id="price-per-unit"] ::text'
        name = grocery.css(NAME_SELECTOR).extract_first()
        name = clean_string(name, "\"")
        ounces = self.ounces
        pounds = self.pounds
        count = self.count
        price = str(handle_none(grocery.css(SALEPRICE_SELECTOR).extract_first())).replace('$', '')
        ppu = convert_ppu(grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first())
        url = response.url
        yield {
            'name': name,
            'ounces': ounces,
            'pounds': pounds,
            'count': count,
            'price': price,
            'price-per-unit': ppu,
            'section': section,
            'subsection': subsection,
            'url': url,
        }
    finish_url(self.conn, self.store_id, url)
    next_url = get_next_url(self.cursor, 1)
    print(f"next_url - {next_url}")
    if next_url is None:
        print("No more urls - finishing")
    else:
        yield SplashRequest(next_url, self.parse,
                            endpoint='render.html',
                            args={
                                'wait': 10,
                                'section': section,
                                'subsection': subsection
                            })
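# The size-parsing block above is easier to test in isolation. A minimal sketch of
# the same regex logic as a standalone helper; the name, return shape, and use of
# plain float() instead of the project's parse_float are assumptions for illustration.
import re

def parse_size(name):
    """Return the total ounces implied by a product name, or 0.0 if none found."""
    decimal = r"([\d]+[.]?[\d]*|[.\d]+)"
    ounces = re.findall(decimal + r"\s*o(?:z|unces?)", name, re.IGNORECASE)
    pounds = re.findall(decimal + r"\s*(?:pound|lb)s?", name, re.IGNORECASE)
    count = re.findall(r"([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))", name, re.IGNORECASE)
    oz = float(ounces[0]) if ounces else 0.0
    if pounds:
        oz = 16 * float(pounds[0])
    elif count:
        oz *= float(count[0])
    return oz

# e.g. parse_size("Brand Black Beans 15.5 oz, 4 pack") -> 62.0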
def parse(self, response):
    url = response.url
    finish_url(self.conn, self.store_id, url)
    items = response.css('.cell-content-wrapper')
    metadata = get_url_metadata(self.cursor, url)
    section = metadata[1]
    subsection = metadata[2]
    # Check if it has a next button
    next_page = response.css('.pagination-next:not(.disabled)').get()
    if next_page is not None:
        #inspect_response(response, self)
        page_string = "?page="
        page_str_len = len(page_string)
        i = url.find(page_string)
        # If yes, check whether the url already has a page part on it
        if i == -1:
            # If not, add ?page=2 to it
            next_url = url + page_string + "2"
        else:
            # If it does, extract the page number and add 1
            page_number = i + page_str_len
            current_page = int(url[page_number:])
            next_page = current_page + 1
            next_url = url[:page_number] + str(next_page)
        # Then add it to the queue
        store_url(self.conn, next_url, self.store_id,
                  lookup_category("", section, subsection), section, subsection)
    for item in items:
        name = item.css('.cell-title-text ::text').get()
        name = clean_string(name, ['\"'])
        price = item.css('[data-test="amount"] .css-19m8h51 ::text').get()
        price = convert_dollars(price)
        quantity = item.css('[data-test="amount"] .css-cpy6p ::text').get()
        unit = item.css('.cell-product-size ::text').get()
        ounces = convert_to_ounces(unit)
        ppu = item.css('[data-test="per-unit-price"] ::text').get()
        ppu = convert_ppu(ppu)
        print(f"name - {name}, price - {price}, quantity - {quantity}, "
              f"ounces - {ounces}, ppu - {ppu}, url - {url}, "
              f"section - {section}, subsection - {subsection}")
        #inspect_response(response, self)
        yield {
            "name": name,
            "price": price,
            "ounces": ounces,
            "unit": unit,
            "price-per-unit": ppu,
            "url": url,
            "section": section,
            "subsection": subsection
        }
    next_url = get_next_url(self.cursor, 1)
    if next_url is None:
        print("No more URLs to parse. Finishing")
        return
    request = self.create_parse_request(
        next_url, self.parse,
        EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))
    try:
        yield request
    except Exception:
        print(f"Parse - Errored out processing request for - {next_url}")
        next_url = get_next_url(self.cursor, 2)
        print(f"Parse - Now handling {next_url}")
        request = self.create_parse_request(
            next_url, self.parse,
            EC.element_to_be_clickable((By.CSS_SELECTOR, '[add-to-cart]')))
        yield SeleniumRequest(url=next_url,
                              callback=self.parse,
                              wait_time=50,
                              wait_until=EC.element_to_be_clickable(
                                  (By.CSS_SELECTOR, '.button.full.cart.add')))
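# The "?page=" handling above (and the "&page=" variant in the Walmart spider)
# follows one pattern: if the URL has no page parameter yet, append page 2,
# otherwise increment the existing number. A minimal standalone sketch of that
# logic, as a hypothetical helper (not the project's get_next_pagination), which
# assumes the page number is the final component of the URL:
def increment_page(url, page_string="?page="):
    """Return the URL for the next results page."""
    i = url.find(page_string)
    if i == -1:
        return url + page_string + "2"
    start = i + len(page_string)
    current_page = int(url[start:])
    return url[:start] + str(current_page + 1)

# increment_page("https://example.com/aisle")        -> ".../aisle?page=2"
# increment_page("https://example.com/aisle?page=2") -> ".../aisle?page=3"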
def parse(self, response):
    # This callback determines if the selected menu is
    # at the top of the list; if it is, it adds the urls
    # to the list and keeps going.
    # If it is not, it calls the lua to prepare the page
    # for scraping, and then scrapes it.
    url = response.url
    menu = response.css(".category-filter__link")
    #submenu = response.css("")
    #print("self.urls - " + str(self.urls))
    print("processing response.url - " + response.url)
    #print("menu: ")
    #print(menu.getall())
    #print("len(menu): " + str(len(menu)))
    #print("menu[0] : " + menu.get())
    #print("name - " + menu[0].css('.category-filter__text ::text').get())
    #inspect_response(response, self)
    if (len(menu) > 0 and menu[0].css('[aria-current="page"]')):
        print(f"inside menu page for url - {url}")
        # The top page is active,
        #print("menu[0] : [aria-current=page] " + menu[0].css('[aria-current="page"]').get())
        # therefore we need to scrape the links and continue searching.
        # We then need to loop through each other page,
        # call parse, and scrape it if it is not.
        menu_url = menu[0].css('::attr(href)').get()
        menu_name = menu[0].css('.category-filter__text ::text').get()
        for item in menu:
            heading = item.css('.category-filter__text ::text').get()
            scraped_url = item.css('::attr(href)').get()
            scraped_url = self.base_url + scraped_url
            section = menu_name
            subsection = heading
            category = lookup_category("", section, subsection)
            store_url(self.conn, scraped_url, self.store_id, category,
                      section, subsection)
            #self.section_dict[url] = (menu_name, heading)
            #if self.urls.count(url) == 0:
            #    self.urls.append(url)
        #urls = menu.css('::attr(href)').getall()
        # Remove the first (this) page from the list to parse
        #urls.pop()
        #self.urls.extend(urls)
        #print("urls to scrape - " + str(self.urls))
        #print("local urls - " + str(urls))
        """
        while len(self.urls) != 0:
            url = self.urls.pop()
            self.processedUrls.append(url)
            #url = self.base_url + url_suffix
            #print("urls - " + str(self.urls))
            #print("pulling from url - " + url)
            #print("urls lengths - " + str(len(self.urls)))
            yield SplashRequest(url, self.parse,
                                endpoint='execute',
                                args={'lua_source': self.expand_and_scroll_lua})
        """
    elif (len(menu) == 0):
        inspect_response(response, self)
    else:
        # We are on a subpage, so now we can start scraping
        GROCERY_SELECTOR = '.grid-item'
        NAME_SELECTOR = '.small-type.detail-card-description ::text'
        PRICE_SELECTOR = '.price ::text'
        PRICE_PER_UNIT_SELECTOR = '.sub-headline.detail-card-subtext ::text'
        metadata = get_url_metadata(self.cursor, url)
        section = metadata[0]
        subsection = metadata[1]
        print("subpage - scraping " + url + ", from section - " + section)
        for grocery in response.css(GROCERY_SELECTOR):
            self.name = grocery.css(NAME_SELECTOR).extract_first()
            self.price = grocery.css(PRICE_SELECTOR).extract_first()
            if self.price is not None:
                self.price = self.price.replace('*', '').replace('$', '')
            self.ppu = grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first()
            if self.ppu is not None:
                self.ppu = convert_ppu(self.ppu)
            #inspect_response(response, self)
            # Parse the ounces off of the name
            yield {
                'name': self.name,
                'price': self.price,
                'price-per-unit': self.ppu,
                'section': section,
                'subsection': subsection,
                'url': response.url
            }
    finish_url(self.conn, self.store_id, url)
    print("finishing url - " + url)
    next_url = get_next_url(self.cursor, 1)
    if next_url is not None:
        print("got next_url - " + next_url)
        yield SplashRequest(next_url,
                            self.parse,
                            endpoint='execute',
                            dont_filter=True,
                            args={'lua_source': self.expand_and_scroll_lua})
    else:
        print("Next url is none therefore we must be finished!")