# Imports shared by the scraper methods in this section.
import re

from bs4 import BeautifulSoup


def next_page_url(self, url):
    self.pc += 1
    imitate_user(0.5)
    next_url = url
    if self.page_url:
        next_url += self.page_url
    next_url += str(self.pc)
    if self.pc == self.depth_limit:
        self.run = False  # recursion limit reached
    return next_url
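# The methods in this section call imitate_user(), which is not defined
# here. A minimal sketch, assuming it adds a randomized, human-like delay
# scaled by its argument (the behavior is inferred from the call sites
# above, not confirmed by the source):
import random
import time


def imitate_user(scale):
    """Sleep for a randomized interval to mimic human browsing pauses."""
    time.sleep(scale * random.uniform(0.5, 1.5))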
def get_page(self, url):
    try:
        print("Getting %s" % url)
        self.driver.get(url)
        # self.driver.get_cookies()
    except ValueError:
        imitate_user(5)  # back off, then retry once
        try:
            self.driver.get(url)
        except Exception:
            raise
    except Exception as e:
        print(url, e)
    page = BeautifulSoup(self.driver.page_source, "lxml")
    return page
def get_page(self, url):
    try:
        imitate_user(0.2)
        self.logger.info("Getting %s" % url)
        self.driver.get(url)
    except ValueError:
        imitate_user(2)  # back off, then retry once
        try:
            self.driver.get(url)
        except Exception:
            raise
    except Exception as e:
        self.logger.error("%s: %s", url, e)
    page = BeautifulSoup(self.driver.page_source, "lxml")
    return page
def next_page_url(self, url): """ method to build the next url to be fetched. Expects a pagination parameter in the url and builds it iteratively. pc is the control parameter. :param url: :return: """ self.pc += 1 imitate_user(1) next_url = url if self.page_url: next_url += self.page_url # next_url += str(self.pc) if self.pc == 1: # this should be the last param self.run = False # recursion control return next_url
def get_list(self, page):
    # The listing container class varies between result pages; try each
    # known layout in turn.
    if page.find_all("div", {"class": "nme"}):
        entries = page.find_all("div", {"class": "nme"})
    elif page.find_all("div", {"class": "rdetails"}):
        entries = page.find_all("div", {"class": "rdetails"})
    elif page.find_all("div", {"class": "info"}):
        entries = page.find_all("div", {"class": "info"})
    else:
        return  # no recognized listing container on this page
    # entries = page.find_all("div", {"class": "listing"})
    for e in entries:
        if len(e) == 1:
            continue
        elif e.name == "script":
            continue
        else:
            imitate_user(1)
            entry = self.get_detail(e.a.attrs['href'], {})
            if entry:
                self.process_output(entry)
def get_page(self, url):
    try:
        self.logger.info("Getting %s" % url)
        self.driver.get(url)
        # self.driver.get_cookies()
    except ValueError:
        imitate_user(2)  # back off, then retry once
        try:
            self.driver.get(url)
        except Exception:
            raise
    except Exception as e:
        self.logger.error("%s: %s", url, e)
    # try:
    #     wait = WebDriverWait(self.driver, 3)
    #     wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div")))
    # except Exception as e:
    #     self.logger.error("WebDriverWait error")
    page = BeautifulSoup(self.driver.page_source, "lxml")
    return page
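# If the explicit wait commented out above is re-enabled, it needs the
# selenium imports below. A sketch of the same pattern with the imports
# wired up (a standalone helper, not part of the original module):
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_content(driver, timeout=3):
    """Block until at least one div is visible, or give up after timeout."""
    try:
        WebDriverWait(driver, timeout).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, "div"))
        )
    except TimeoutException:
        pass  # proceed with whatever has rendered so far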
def get_list(self, page): """ method takes search results page from Walmart and parses out items to save. Has error checking for no search results or an empty set. :param page: bs4 object returned from get_page :return: """ imitate_user(0.5) if page.find(string=re.compile(r'We found 0 results')): self.run = False return elif not page.find("ul", {"class": "tile-list-grid"}): self.run = False return else: entries = page.find("ul", {"class": "tile-list-grid"}) for e in entries: if len(e) == 1: continue elif e.name == "script": continue else: entry = {} try: entry['title'] = e.find("a", {"class":"js-product-title"}).get_text().strip() except: continue if 'http://' in e.find("a", {"class":"js-product-title"}).attrs['href']: entry['url'] = e.find("a", {"class":"js-product-title"}).attrs['href'] else: entry['url'] = "".join((self.base_url, e.find("a", {"class":"js-product-title"}).attrs['href'])) try: entry['price'] = e.find("span", {"class":"price-display"}).get_text().replace('$', '') except: continue entry['img'] = e.find("img", {"class":"product-image"}).attrs['data-default-image'] entry['item_id'] = e.find("div", {"class": "js-tile", "class": "tile-grid-unit"}).attrs['data-item-id'] #entry['az_price'], entry['weight'], entry['az_sales_rank'], entry['az_match'], entry['az_url'], entry['az_asin'] = self.az.find_best_match(entry['title'], 'Toys') #entry['net'] = self.get_net(entry) #entry['roi'] = self.get_roi(entry) self.process_output(entry)