Пример #1
0
    def start_requests(self):
        """Kick off the crawl: request the first start URL and hand the
        rendered page to crawl_menu once the nav toggle is clickable."""
        print("inside start_requests")

        # Selenium wait condition: the page is considered ready when the
        # '.nav-open' element can be clicked.
        nav_ready = EC.element_to_be_clickable((By.CSS_SELECTOR, '.nav-open'))
        yield create_parse_request(self.start_urls[0], self.crawl_menu, nav_ready)
Пример #2
0
    def start_requests(self):
        """Begin the crawl: load the first start URL and route the response
        to check_location once the fulfillment-modal button is clickable."""
        wait_for_modal_button = EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#openFulfillmentModalButton'))
        yield create_parse_request(
            self.start_urls[0], self.check_location, wait_for_modal_button)
Пример #3
0
    def walk_menu(self, response):
        """Open the site's navigation menu with Selenium, store every
        department/aisle link in the URL database, then yield a request
        for the first queued aisle URL.

        Drives the live browser attached to the response (clicking the nav
        button and each department) rather than parsing static HTML.
        """
        # inspect_response(response,self)
        self.driver = response.request.meta['driver']
        self.logger.info('Inside walk_menu')
        start_url = self.driver.current_url
        menu_button = self.driver.find_element_by_css_selector(
            '[data-automation-id="NavigationBtn"]')
        menu_button.click()

        # Brief pause so the navigation panel can finish rendering.
        time.sleep(.5)

        departments = self.driver.find_elements_by_css_selector(
            '.NavigationPanel__department___1DF7d button')
        for department in departments:
            department_name = department.get_attribute('aria-label')
            # Clicking a department reveals its aisle links in the panel.
            department.click()
            time.sleep(.5)
            aisles = self.driver.find_elements_by_css_selector(
                '.NavigationPanel__aisleLink___309i2')
            for aisle in aisles:
                url = aisle.get_attribute('href')
                aisle_name = aisle.get_attribute('innerText')
                # self.department_name = department_name
                # self.aisle_name = aisle_name
                self.logger.info(
                    f"department_name: {department_name}, aisle_name: {aisle_name}"
                )
                category = lookup_category("", department_name, aisle_name)
                self.logger.info(f"Storing aisle: {aisle_name}, url: {url}")
                # Persist the aisle URL so later requests can be pulled
                # from the database queue.
                store_url(self.conn, url, self.store_id, category,
                          department_name, aisle_name)

        # Mark the menu page itself as done, then pull the next queued
        # aisle URL from the URL-scraping queue (scrape_urls=True).
        finish_url(self.conn, self.store_id, start_url, scrape_urls=True)
        next_url = get_next_url(self.cursor,
                                1,
                                store_id=self.store_id,
                                scrape_urls=True,
                                filter="aisle=")
        if next_url is None:
            self.logger.debug(
                "Next_url is None therefore we must be finished!")
            return

        self.next_url = next_url
        pagination_request = create_parse_request(next_url,
                                                  self.handle_pagination,
                                                  EC.element_to_be_clickable(
                                                      (By.CSS_SELECTOR,
                                                       self.PAGE_LOAD)),
                                                  errback=self.retry,
                                                  meta_url=next_url,
                                                  cookies=False)

        yield pagination_request
Пример #4
0
    def parse(self, response):
        """Scrape product items from a paginated listing page, yield one
        dict per product, then queue the next URL from the database.

        Only pages whose URL contains self.page_str are scraped for items;
        other pages are still run through scrape_urls and finished.
        """
        # The site redirects to the URL with a "<page_str>1" suffix, which is
        # not how the URL was stored; trim it so DB lookups match.
        page_1_str=self.page_str+"1"
        this_url = trim_url(response.url,page_1_str)
        print (f"inside parse for {this_url}")
        self.scrape_urls(response)

        # Only scrape pages that have the page_str in the url.
        if this_url.find(self.page_str) != -1:
            print (f"scraping for {this_url}")
            items = response.css('product-item-v2')
            print(f"length of items - {len(items)}")
            # Metadata tuple from the DB; index 1 = section, 2 = subsection.
            metadata=get_url_metadata(self.cursor,this_url)
            section=metadata[1]
            subsection=metadata[2]
            for item in items:
                name = item.css('.product-title ::text').get()
                # Last price text node is the effective (sale) price;
                # strip the '$' sign.
                price_strings = item.css('.product-price ::text').getall()
                price = clean_string(price_strings[-1],['$'])
                ppu = item.css('.product-price-qty ::text').get()
                unit = self.collect_units(name)
                #inspect_response(response,self)

                # Only weight-based units get an ounce value parsed from
                # the product name.
                if unit == "OZ" or unit == "LB":
                    ounces = self.collect_ounces(name)
                else:
                    ounces = 0
                print (f"yielding - {name}, {price}, {ppu}, {ounces}, {unit}")
                yield{
                  "name": name,
                  "price": price,
                  "ounces": ounces,
                  "unit": unit,
                  "price-per-unit": ppu,
                  "url": this_url,
                  "section": section,
                  "subsection": subsection
                }

        #Basically the website redirects us to the url and page_1_str, which isn't added to our database
        # So we trim that off so we can get the url in our database
        finish_url(self.conn,self.store_id,this_url)
        print("finishing url - " + this_url)
        next_url = get_next_url(self.cursor, 1)
        if next_url is None:
            print ("Next url is none therefore we must be finished ! ")
            return
        else:
            # Route the next page through check_location first so the store
            # location is (re)validated before parsing.
            next_request = create_parse_request(next_url,
                                                self.check_location,
                                                EC.element_to_be_clickable((By.CSS_SELECTOR,'#openFulfillmentModalButton')))
        print(f"got next_url - {next_url}")
        yield next_request
Пример #5
0
    def get_next_request(self):
        #next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
        #                        scrape_urls=True,filter=self.page_string,reverse_filter=True)
        next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                        scrape_urls=True)

        request = create_parse_request(next_url,
                                       self.handle_pagination,
                                       EC.element_to_be_clickable(
                                           (By.CSS_SELECTOR, '.pagination-page.pager-item')),
                                       meta_url=next_url,
                                       errback=self.no_pagination
                                       )
        return request

#    def close_modal(self):
#        close_button = self.driver.find_elements_by_css_selector(
#            '#shopping-selector-parent-process-modal-close-click')
#        if close_button:
#            self.logger.info("Closing modal")
#            close_button[0].click()
#            time.sleep(.5)
#        else:
#            self.logger.info("No Modal detected continuing")
#
#    def change_store_location(self):
#        store_button = self.driver.find_element_by_css_selector(
#            '[data-test="store-button"]')
#        current_store = store_button.text
#        if current_store == self.location:
#            self.logger.info(
#                f"Current location = {current_store} is correct. Continuing.")
#            return
#        store_button.click()
#        time.sleep(self.delay)
#        stores = self.driver.find_elements_by_css_selector('.store-row')
#        # Go through each of the stores, until one matches the text, then click on it
#        for store in stores:
#            name = store.find_element_by_css_selector('.name')
#            store_name = name.text
#            #print (f"change_store_location - {name.text}")
#            if store_name == self.location:
#                button = store.find_element_by_css_selector(
#                    '[data-test="select-store-button"]')
#                button.click()
#                time.sleep(self.delay)
#                self.logger.info(f"Set location to {store_name}")
#                return

        self.logger.warn(f"Could not set location to {self.location}")
Пример #6
0
 def handle_onboard(self, response):
     """Dismiss the onboarding modal in the live browser, then queue the
     next unscraped aisle URL (if any) for parsing."""
     self.driver = response.request.meta['driver']
     self.logger.info("Handling Onboard modal")
     self.driver.find_element_by_css_selector(
         '[data-automation-id="onboardingModalCloseBtn"]').click()
     # Short pause so the modal can finish closing before we move on.
     time.sleep(.5)
     pending_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                                scrape_urls=False, filter="aisle=")
     if pending_url is None:
         self.logger.debug(
                 "Next_url is None therefore we must be finished!")
         return
     # Wait for the pagination marker before parse runs on the new page.
     yield create_parse_request(pending_url,
                                self.parse,
                                EC.element_to_be_clickable(
                                    (By.CSS_SELECTOR, '[aria-current="page"]')),
                                meta_url=pending_url)
Пример #7
0
    def handle_pagination(self, response):
        """If the current listing page has a next-page button, store the
        next page's URL (with its grocery count) in the database, then
        finish this URL and yield a request for the next queued aisle URL.
        """
        self.logger.info('inside handle_pagination')
        url = self.driver.current_url
        # find_elements (plural) returns [] when there is no next button,
        # i.e. this is the last page of the aisle.
        next_button = self.driver.find_elements_by_css_selector(
            self.NEXT_BUTTON_SELECTOR)
        # inspect_response(response,self)
        if len(next_button) != 0:
            next_page_url = get_next_pagination(self.PAGE_STRING, url)
            # Metadata tuple: index 0 = category, 1 = section, 2 = subsection.
            metadata = get_url_metadata(self.cursor, url)
            category = metadata[0]
            section = metadata[1]
            subsection = metadata[2]
            # Pull the total item count from the page header and keep only
            # the digits (e.g. "123 results" -> "123").
            quantity = self.driver.find_element_by_css_selector(
                '.Title__browseTotalCount___OWylh').get_attribute('innerText')
            quantity = re.findall('[0-9]+', quantity)[0]
            store_url(self.conn,
                      next_page_url,
                      self.store_id,
                      category,
                      section,
                      subsection,
                      grocery_quantity=quantity)

        # Mark this page done and pull the next aisle URL from the
        # URL-scraping queue.
        finish_url(self.conn, self.store_id, url, scrape_urls=True)
        next_url = get_next_url(self.cursor,
                                1,
                                store_id=self.store_id,
                                scrape_urls=True,
                                filter="aisle=")
        if next_url is None:
            self.logger.debug(
                "Next_url is None therefore we must be finished!")
            return
        request = create_parse_request(next_url,
                                       self.handle_pagination,
                                       EC.element_to_be_clickable(
                                           (By.CSS_SELECTOR, self.PAGE_LOAD)),
                                       errback=self.retry,
                                       meta_url=next_url,
                                       cookies=False)
        yield request
Пример #8
0
 def start_requests(self):
     """Yield a parse request for every URL in the database queue, waiting
     for the add-to-cart button to be clickable on each page."""
     print("inside start_requests")
     ADD_TO_CART_SELECTOR = '#product-main > div.forlistview-right > span > a.btn.btn-primary'
     #SELECTOR = '.smart-filter > h2:nth-child(1)'
     #SIDEFILTER_SELECTOR='.sidefilter-title'
     next_url = get_next_url(self.cursor, 1)
     while next_url is not None:
         current_url = next_url
         scrape_url_request = create_parse_request(
             current_url, self.parse,
             EC.element_to_be_clickable(
                 (By.CSS_SELECTOR, ADD_TO_CART_SELECTOR)))
         #scrape_url_request = create_parse_request(current_url,self.parse,EC.visibility_of_element_located((By.CSS_SELECTOR,SELECTOR)))
         # NOTE(review): i is reset to 1 every iteration, so the attempt
         # counter never persists across loop passes — confirm intent.
         i = 1
         # This try except isnt really doing anything. Somehow I need to catch this and handle it elsewhere?
         # NOTE(review): in a generator, exceptions raised while the request
         # is processed do not propagate through `yield` here, so this
         # except block likely never fires — needs an errback instead.
         try:
             yield scrape_url_request
         except (NoSuchWindowException, TimeoutException) as e:
             print(f"Handling exception {e} ")
             i += 1
         next_url = get_next_url(self.cursor, i)
Пример #9
0
    def get_next_request(self, attempt=1):
        """Pull the next pending URL for this store and build a parse
        request for it.

        Returns None when no URLs remain in the queue.
        """
        #TODO think about adding in a url filter
        pending_url = get_next_url(self.cursor, 1, store_id=self.store_id)

        if pending_url is None:
            self.logger.info(
                "Could not find any more urls, therefore we must be finished!")
            return None

        # Page is considered loaded once a product cell is clickable.
        wait_condition = EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '[data-test="product-cell"]'))
        return create_parse_request(pending_url,
                                    self.parse,
                                    wait_condition,
                                    meta_url=pending_url,
                                    errback=self.retry_page,
                                    filter=False,
                                    attempt=attempt)
Пример #10
0
    def handle_onboard(self, response):
        """Close the onboarding modal, set the store location, and yield the
        next request: either walk_menu on this page, or (if this page was
        already scraped) a pagination request for the next queued aisle URL.
        """
        self.driver = response.request.meta['driver']
        url = response.url
        self.logger.info("Handling Onboard modal")
        self.close_modal()
        self.change_location(response)
        # change_location may reopen a modal; close it again to be safe.
        self.close_modal()
        self.logger.info(f"about to call walk_menu with response.url: {url}")
        # Default request: re-walk the menu on this page once the nav
        # button is clickable.
        request = create_unfiltered_parse_request(
            response.url,
            self.walk_menu,
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '[data-automation-id="NavigationBtn"]')),
            errback=self.prompt_blocked,
            meta_url=response.url,
            cookies=False)

        # If this page was already scraped, skip the menu walk and resume
        # pagination from the next queued aisle URL instead.
        if is_url_scraped(self.cursor, url, self.store_id, scrape_urls=True):
            next_url = get_next_url(self.cursor,
                                    1,
                                    store_id=self.store_id,
                                    scrape_urls=True,
                                    filter="aisle=")
            if next_url is None:
                self.logger.debug(
                    "Next_url is None therefore we must be finished!")
                return
            request = create_parse_request(next_url,
                                           self.handle_pagination,
                                           EC.element_to_be_clickable(
                                               (By.CSS_SELECTOR,
                                                self.PAGE_LOAD)),
                                           errback=self.retry,
                                           meta_url=next_url,
                                           cookies=False)
        self.logger.info(f"About to yield request: {request}")
        yield request
Пример #11
0
    def parse(self, response):
        """Scrape product tiles from an aisle page, yield one dict per
        product, then queue the next pending aisle URL.

        Each item carries the name, size (ounces/pounds/count parsed from
        the product name), prices, and the page's section/subsection from
        the URL-metadata table.
        """
        url = response.url
        self.logger.info(f"Inside parse for {url}")

        GROCERY_SELECTOR = '[data-automation-id="productTile"]'
        SPONSORED_SELECTOR = '[data-automation-id="sponsoredProductTile"]'
        GROCERIES_SELECTOR = GROCERY_SELECTOR + ',' + SPONSORED_SELECTOR
        # Metadata tuple from the DB; index 1 = section, 2 = subsection.
        metadata = get_url_metadata(self.cursor, url)
        section = metadata[1]
        subsection = metadata[2]

        for grocery in response.css(GROCERIES_SELECTOR):
            NAME_SELECTOR = '[data-automation-id="name"] ::attr(name)'
            name = grocery.css(NAME_SELECTOR).extract_first()
            # Parse the package size out of the product name. Raw strings
            # fix the invalid escape sequences ("\d", "\s") the originals
            # relied on.
            decimal_regex = r"([\d]+[.]?[\d]*|[.\d]+)"
            ounces = re.findall(decimal_regex + r"\s*o(?:z|unces?)",
                                name, re.IGNORECASE)
            pounds = re.findall(decimal_regex + r"\s*(?:pound|lb)s?",
                                name, re.IGNORECASE)
            count = re.findall(r"([\d]+)\s*(?:c(?:t|ount)|p(?:k|ack))",
                               name, re.IGNORECASE)
            # NOTE(review): these attributes look like debug leftovers
            # (note the singular 'ounce'); kept in case something else
            # reads them — confirm before removing.
            self.ounce = ounces
            self.pounds = pounds
            self.count = count
            # re.findall returns a list; take the first match or default to 0.
            ounces = parse_float(ounces[0]) if ounces else 0
            pounds = parse_float(pounds[0]) if pounds else 0
            count = parse_float(count[0]) if count else 0

            # Normalize to ounces: 16 oz per pound; multi-packs multiply
            # the per-item ounces by the pack count.
            if pounds != 0:
                ounces = 16 * pounds
            elif count != 0:
                ounces *= count

            SALEPRICE_SELECTOR = '[data-automation-id="salePrice"] ::text'
            PRICE_PER_UNIT_SELECTOR = '[data-automation-id="price-per-unit"] ::text'

            # Strip quotes from the name and the '$' from the price string;
            # handle_none guards against a missing sale price.
            name = clean_string(name, "\"")
            price = str(handle_none(
                grocery.css(SALEPRICE_SELECTOR).extract_first())).replace('$', '')
            ppu = convert_ppu(grocery.css(PRICE_PER_UNIT_SELECTOR).extract_first())

            yield {
                'name': name,
                'ounces': ounces,
                'pounds': pounds,
                'count': count,
                'price': price,
                'price-per-unit': ppu,
                'section': section,
                'subsection': subsection,
                'url': url,
            }

        # Mark this page done and pull the next aisle URL from the queue.
        finish_url(self.conn, self.store_id, url)
        next_url = get_next_url(self.cursor, 1, store_id=self.store_id,
                                filter="aisle=")

        print(f"next_url - {next_url}")
        if next_url is None:
            print("No more urls - finishing")
        else:
            request = create_parse_request(next_url,
                                           self.parse,
                                           EC.element_to_be_clickable(
                                               (By.CSS_SELECTOR, '[aria-current="page"]')),
                                           meta_url=next_url)
            yield request