def start_requests(self): print("inside start_requests") scrape_url_request = create_parse_request( self.start_urls[0], self.crawl_menu, EC.element_to_be_clickable((By.CSS_SELECTOR, '.nav-open'))) yield scrape_url_request
def start_requests(self):
    location_request = create_parse_request(
        self.start_urls[0], self.check_location,
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#openFulfillmentModalButton')))
    yield location_request
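Both start_requests variants lean on a create_parse_request helper that is defined elsewhere. A minimal sketch of what it presumably does, assuming scrapy-selenium's SeleniumRequest underneath (the wait_time default and the meta/cookie handling here are guesses from the call sites, not the real implementation):

# Hypothetical sketch of the create_parse_request helper used throughout.
from scrapy_selenium import SeleniumRequest

def create_parse_request(url, callback, wait_until, errback=None,
                         meta_url=None, cookies=True, filter=True,
                         attempt=1):
    """Build a SeleniumRequest that waits for `wait_until` before parsing."""
    return SeleniumRequest(
        url=url,
        callback=callback,
        errback=errback,
        wait_time=10,             # how long WebDriverWait polls for the EC (assumed)
        wait_until=wait_until,    # e.g. EC.element_to_be_clickable(...)
        dont_filter=not filter,   # filter=False lets us revisit a url
        meta={'url': meta_url, 'attempt': attempt,
              'dont_merge_cookies': not cookies},
    )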
def walk_menu(self, response):
    # inspect_response(response, self)
    self.driver = response.request.meta['driver']
    self.logger.info('Inside walk_menu')
    start_url = self.driver.current_url
    menu_button = self.driver.find_element_by_css_selector(
        '[data-automation-id="NavigationBtn"]')
    menu_button.click()
    time.sleep(.5)
    departments = self.driver.find_elements_by_css_selector(
        '.NavigationPanel__department___1DF7d button')
    for department in departments:
        department_name = department.get_attribute('aria-label')
        department.click()
        time.sleep(.5)
        aisles = self.driver.find_elements_by_css_selector(
            '.NavigationPanel__aisleLink___309i2')
        for aisle in aisles:
            url = aisle.get_attribute('href')
            aisle_name = aisle.get_attribute('innerText')
            # self.department_name = department_name
            # self.aisle_name = aisle_name
            self.logger.info(
                f"department_name: {department_name}, aisle_name: {aisle_name}")
            category = lookup_category("", department_name, aisle_name)
            self.logger.info(f"Storing aisle: {aisle_name}, url: {url}")
            store_url(self.conn, url, self.store_id, category,
                      department_name, aisle_name)
    finish_url(self.conn, self.store_id, start_url, scrape_urls=True)
    next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                            scrape_urls=True, filter="aisle=")
    if next_url is None:
        self.logger.debug("Next_url is None therefore we must be finished!")
        return
    self.next_url = next_url
    pagination_request = create_parse_request(
        next_url, self.handle_pagination,
        EC.element_to_be_clickable((By.CSS_SELECTOR, self.PAGE_LOAD)),
        errback=self.retry, meta_url=next_url, cookies=False)
    yield pagination_request
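walk_menu hands every aisle url to store_url and later pulls urls back with get_next_url, so the database acts as a persistent work queue. The real helpers live elsewhere in the project; a minimal sketch of the store side, with the table and column names assumed rather than taken from the actual schema:

# Hypothetical sketch of the store_url helper; table/column names are assumed.
def store_url(conn, url, store_id, category, section, subsection,
              grocery_quantity=None):
    # Insert the url into the work queue if we haven't seen it yet;
    # finished=0 marks it as still needing a visit.
    conn.execute(
        """INSERT OR IGNORE INTO urls
           (url, store_id, category, section, subsection, quantity, finished)
           VALUES (?, ?, ?, ?, ?, ?, 0)""",
        (url, store_id, category, section, subsection, grocery_quantity))
    conn.commit()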
def parse(self, response):
    # The website redirects us to the url plus page_1_str, which isn't the
    # form stored in our database, so trim it off to match the stored url.
    page_1_str = self.page_str + "1"
    this_url = trim_url(response.url, page_1_str)
    print(f"inside parse for {this_url}")
    self.scrape_urls(response)
    # Only scrape pages that have the page_str in the url.
    if this_url.find(self.page_str) != -1:
        print(f"scraping for {this_url}")
        items = response.css('product-item-v2')
        print(f"length of items - {len(items)}")
        metadata = get_url_metadata(self.cursor, this_url)
        section = metadata[1]
        subsection = metadata[2]
        for item in items:
            name = item.css('.product-title ::text').get()
            price_strings = item.css('.product-price ::text').getall()
            price = clean_string(price_strings[-1], ['$'])
            ppu = item.css('.product-price-qty ::text').get()
            unit = self.collect_units(name)
            # inspect_response(response, self)
            if unit == "OZ" or unit == "LB":
                ounces = self.collect_ounces(name)
            else:
                ounces = 0
            print(f"yielding - {name}, {price}, {ppu}, {ounces}, {unit}")
            yield {
                "name": name,
                "price": price,
                "ounces": ounces,
                "unit": unit,
                "price-per-unit": ppu,
                "url": this_url,
                "section": section,
                "subsection": subsection,
            }
    finish_url(self.conn, self.store_id, this_url)
    print("finishing url - " + this_url)
    next_url = get_next_url(self.cursor, 1)
    if next_url is None:
        print("Next url is None therefore we must be finished!")
        return
    next_request = create_parse_request(
        next_url, self.check_location,
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#openFulfillmentModalButton')))
    print(f"got next_url - {next_url}")
    yield next_request
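trim_url is another external helper; given how it's used above, a minimal sketch, assuming it just strips a known suffix off the url, could be:

# Hypothetical sketch of trim_url; the real helper lives elsewhere in the repo.
def trim_url(url, suffix):
    # Drop the redirect-added suffix (e.g. the "page=1" tail) so the url
    # matches the form stored in the database.
    return url[:-len(suffix)] if url.endswith(suffix) else url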
def get_next_request(self):
    # next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
    #                         scrape_urls=True, filter=self.page_string,
    #                         reverse_filter=True)
    next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                            scrape_urls=True)
    request = create_parse_request(
        next_url, self.handle_pagination,
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '.pagination-page.pager-item')),
        meta_url=next_url,
        errback=self.no_pagination)
    return request

# def close_modal(self):
#     close_button = self.driver.find_elements_by_css_selector(
#         '#shopping-selector-parent-process-modal-close-click')
#     if close_button:
#         self.logger.info("Closing modal")
#         close_button[0].click()
#         time.sleep(.5)
#     else:
#         self.logger.info("No modal detected, continuing")
#
# def change_store_location(self):
#     store_button = self.driver.find_element_by_css_selector(
#         '[data-test="store-button"]')
#     current_store = store_button.text
#     if current_store == self.location:
#         self.logger.info(
#             f"Current location = {current_store} is correct. Continuing.")
#         return
#     store_button.click()
#     time.sleep(self.delay)
#     stores = self.driver.find_elements_by_css_selector('.store-row')
#     # Go through each of the stores until one matches the text, then click on it
#     for store in stores:
#         name = store.find_element_by_css_selector('.name')
#         store_name = name.text
#         # print(f"change_store_location - {name.text}")
#         if store_name == self.location:
#             button = store.find_element_by_css_selector(
#                 '[data-test="select-store-button"]')
#             button.click()
#             time.sleep(self.delay)
#             self.logger.info(f"Set location to {store_name}")
#             return
#     self.logger.warning(f"Could not set location to {self.location}")
def handle_onboard(self, response):
    self.driver = response.request.meta['driver']
    self.logger.info("Handling Onboard modal")
    close_button = self.driver.find_element_by_css_selector(
        '[data-automation-id="onboardingModalCloseBtn"]')
    close_button.click()
    time.sleep(.5)
    next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                            scrape_urls=False, filter="aisle=")
    if next_url is None:
        self.logger.debug("Next_url is None therefore we must be finished!")
        return
    request = create_parse_request(
        next_url, self.parse,
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '[aria-current="page"]')),
        meta_url=next_url)
    yield request
def handle_pagination(self, response):
    self.logger.info('inside handle_pagination')
    url = self.driver.current_url
    next_button = self.driver.find_elements_by_css_selector(
        self.NEXT_BUTTON_SELECTOR)
    # inspect_response(response, self)
    if len(next_button) != 0:
        # There is another page: store it in the queue, tagged with the
        # product count shown on the current page.
        next_page_url = get_next_pagination(self.PAGE_STRING, url)
        metadata = get_url_metadata(self.cursor, url)
        category = metadata[0]
        section = metadata[1]
        subsection = metadata[2]
        quantity = self.driver.find_element_by_css_selector(
            '.Title__browseTotalCount___OWylh').get_attribute('innerText')
        quantity = re.findall('[0-9]+', quantity)[0]
        store_url(self.conn, next_page_url, self.store_id, category,
                  section, subsection, grocery_quantity=quantity)
    finish_url(self.conn, self.store_id, url, scrape_urls=True)
    next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                            scrape_urls=True, filter="aisle=")
    if next_url is None:
        self.logger.debug("Next_url is None therefore we must be finished!")
        return
    request = create_parse_request(
        next_url, self.handle_pagination,
        EC.element_to_be_clickable((By.CSS_SELECTOR, self.PAGE_LOAD)),
        errback=self.retry, meta_url=next_url, cookies=False)
    yield request
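get_next_pagination isn't shown either; a sketch assuming the page number sits at the end of the url after PAGE_STRING (e.g. "...&page=3"):

# Hypothetical sketch of get_next_pagination; assumes urls end with
# something like "...&page=3" where page_string == "page=".
import re

def get_next_pagination(page_string, url):
    match = re.search(re.escape(page_string) + r"(\d+)$", url)
    if match:
        next_page = int(match.group(1)) + 1
        return url[:match.start(1)] + str(next_page)
    # No page marker yet, so the current page is page 1: append page 2.
    separator = '&' if '?' in url else '?'
    return f"{url}{separator}{page_string}2"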
def start_requests(self):
    print("inside start_requests")
    ADD_TO_CART_SELECTOR = '#product-main > div.forlistview-right > span > a.btn.btn-primary'
    # SELECTOR = '.smart-filter > h2:nth-child(1)'
    # SIDEFILTER_SELECTOR = '.sidefilter-title'
    next_url = get_next_url(self.cursor, 1)
    while next_url is not None:
        current_url = next_url
        scrape_url_request = create_parse_request(
            current_url, self.parse,
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, ADD_TO_CART_SELECTOR)))
        # scrape_url_request = create_parse_request(
        #     current_url, self.parse,
        #     EC.visibility_of_element_located((By.CSS_SELECTOR, SELECTOR)))
        i = 1
        # This try/except isn't really doing anything: exceptions raised while
        # the request is downloaded never propagate back through the yield.
        # They need to be caught elsewhere (e.g. in an errback).
        try:
            yield scrape_url_request
        except (NoSuchWindowException, TimeoutException) as e:
            print(f"Handling exception {e}")
            i += 1
        next_url = get_next_url(self.cursor, i)
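To answer the question in that comment: Scrapy consumes start_requests as a generator and never re-raises downloader errors into it, so the except branch is unreachable. A sketch of the errback approach instead, where request_for and handle_error are hypothetical names, not part of the original spider:

# Hypothetical errback-based rework of the loop above.
ADD_TO_CART_SELECTOR = '#product-main > div.forlistview-right > span > a.btn.btn-primary'

def start_requests(self):
    # Issue one request; failures route to the errback instead of the
    # dead try/except.
    yield from self.request_for(get_next_url(self.cursor, 1))

def request_for(self, url):
    if url is not None:
        yield create_parse_request(
            url, self.parse,
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, ADD_TO_CART_SELECTOR)),
            errback=self.handle_error)

def handle_error(self, failure):
    # Scrapy calls this with a twisted Failure when the request errors out
    # (TimeoutException, NoSuchWindowException, ...). Anything yielded here
    # is scheduled just like callback output.
    self.logger.warning(f"Request failed: {failure.request.url}")
    yield from self.request_for(get_next_url(self.cursor, 2))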
def get_next_request(self, attempt=1):
    # TODO: think about adding in a url filter
    # next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
    #                         filter=self.page_string)
    next_url = get_next_url(self.cursor, 1, store_id=self.store_id)
    if next_url is None:
        self.logger.info(
            "Could not find any more urls, therefore we must be finished!")
        return None
    request = create_parse_request(
        next_url, self.parse,
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '[data-test="product-cell"]')),
        meta_url=next_url,
        errback=self.retry_page,
        filter=False,
        attempt=attempt)
    return request
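retry_page is referenced but not shown. A plausible sketch, assuming it re-queues the same url with the attempt counter bumped and gives up after a cap (MAX_ATTEMPTS is an assumed constant). Because a failed url is never passed to finish_url, get_next_url should hand the same url back on the next call:

# Hypothetical sketch of the retry_page errback wired up above.
MAX_ATTEMPTS = 3  # assumed cap, not from the original code

def retry_page(self, failure):
    attempt = failure.request.meta.get('attempt', 1)
    url = failure.request.meta.get('url')
    if attempt >= MAX_ATTEMPTS:
        self.logger.warning(f"Giving up on {url} after {attempt} attempts")
        finish_url(self.conn, self.store_id, url)  # mark it done so we move on
        yield self.get_next_request()
    else:
        self.logger.info(f"Retrying {url}, attempt {attempt + 1}")
        yield self.get_next_request(attempt=attempt + 1)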
def handle_onboard(self, response):
    self.driver = response.request.meta['driver']
    url = response.url
    self.logger.info("Handling Onboard modal")
    self.close_modal()
    self.change_location(response)
    self.close_modal()
    self.logger.info(f"about to call walk_menu with response.url: {url}")
    request = create_unfiltered_parse_request(
        response.url, self.walk_menu,
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '[data-automation-id="NavigationBtn"]')),
        errback=self.prompt_blocked, meta_url=response.url, cookies=False)
    # If the menu has already been walked for this store, skip straight
    # to pagination instead.
    if is_url_scraped(self.cursor, url, self.store_id, scrape_urls=True):
        next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                                scrape_urls=True, filter="aisle=")
        if next_url is None:
            self.logger.debug(
                "Next_url is None therefore we must be finished!")
            return
        request = create_parse_request(
            next_url, self.handle_pagination,
            EC.element_to_be_clickable((By.CSS_SELECTOR, self.PAGE_LOAD)),
            errback=self.retry, meta_url=next_url, cookies=False)
    self.logger.info(f"About to yield request: {request}")
    yield request
def parse(self, response):
    url = response.url
    self.logger.info(f"Inside parse for {url}")
    GROCERY_SELECTOR = '[data-automation-id="productTile"]'
    SPONSORED_SELECTOR = '[data-automation-id="sponsoredProductTile"]'
    GROCERIES_SELECTOR = GROCERY_SELECTOR + ',' + SPONSORED_SELECTOR
    metadata = get_url_metadata(self.cursor, url)
    section = metadata[1]
    subsection = metadata[2]
    for grocery in response.css(GROCERIES_SELECTOR):
        NAME_SELECTOR = '[data-automation-id="name"] ::attr(name)'
        name = grocery.css(NAME_SELECTOR).extract_first()
        name = clean_string(name, "\"")
        # Parse the ounces, pounds, or count off of the name.
        decimal_regex = r"([\d]+[.]?[\d]*|[.\d]+)"
        ounces = re.findall(decimal_regex + r"\s*o(?:z|unces?)", name,
                            re.IGNORECASE)
        pounds = re.findall(decimal_regex + r"\s*(?:pound|lb)s?", name,
                            re.IGNORECASE)
        count = re.findall(r"([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))", name,
                           re.IGNORECASE)
        # re.findall returns an empty list when there is no match.
        ounces = parse_float(ounces[0]) if ounces else 0
        pounds = parse_float(pounds[0]) if pounds else 0
        count = parse_float(count[0]) if count else 0
        # Normalize everything to ounces.
        if pounds != 0:
            ounces = 16 * pounds
        elif count != 0:
            ounces *= count
        # inspect_response(response, self)
        SALEPRICE_SELECTOR = '[data-automation-id="salePrice"] ::text'
        PRICE_SELECTOR = '[data-automation-id="price"] ::text'
        PRICE_PER_UNIT_SELECTOR = '[data-automation-id="price-per-unit"] ::text'
        price = str(handle_none(
            grocery.css(SALEPRICE_SELECTOR).extract_first())).replace('$', '')
        ppu = convert_ppu(grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first())
        yield {
            'name': name,
            'ounces': ounces,
            'pounds': pounds,
            'count': count,
            'price': price,
            'price-per-unit': ppu,
            'section': section,
            'subsection': subsection,
            'url': url,
        }
    finish_url(self.conn, self.store_id, url)
    next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                            filter="aisle=")
    print(f"next_url - {next_url}")
    if next_url is None:
        print("No more urls - finishing")
    else:
        request = create_parse_request(
            next_url, self.parse,
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '[aria-current="page"]')),
            meta_url=next_url)
        yield request
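To make the unit parsing concrete, here is how those three patterns behave on a few made-up product names (illustrative only, not spider code):

# Illustrative check of the unit-parsing regexes above.
import re

decimal_regex = r"([\d]+[.]?[\d]*|[.\d]+)"
for name in ["Whole Milk 128 oz", "Russet Potatoes 5 lb Bag",
             "Eggs, 12 Count", "Butter .5 Pound"]:
    ounces = re.findall(decimal_regex + r"\s*o(?:z|unces?)", name, re.IGNORECASE)
    pounds = re.findall(decimal_regex + r"\s*(?:pound|lb)s?", name, re.IGNORECASE)
    count = re.findall(r"([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))", name, re.IGNORECASE)
    print(name, ounces, pounds, count)

# Expected output:
#   Whole Milk 128 oz ['128'] [] []
#   Russet Potatoes 5 lb Bag [] ['5'] []
#   Eggs, 12 Count [] [] ['12']
#   Butter .5 Pound [] ['.5'] []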