# Imports shared by the scraper methods in this section.
import re

from bs4 import BeautifulSoup


def next_page_url(self, url):
    self.pc += 1
    imitate_user(0.5)
    next_url = url
    if self.page_url:
        next_url += self.page_url
    next_url += str(self.pc)
    if self.pc == self.depth_limit:
        self.run = False  # recursion limit reached
    return next_url
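# The methods in this section call imitate_user(), which is not defined
# here. A minimal sketch, assuming it adds a randomized, human-like delay
# scaled by its argument (the behavior is inferred from the call sites
# above, not confirmed by the source):
import random
import time


def imitate_user(scale):
    """Sleep for a randomized interval to mimic human browsing pauses."""
    time.sleep(scale * random.uniform(0.5, 1.5))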
def get_page(self, url):
    try:
        print("Getting %s" % url)
        self.driver.get(url)
        # self.driver.get_cookies()
    except ValueError:
        imitate_user(5)  # back off, then retry once
        try:
            self.driver.get(url)
        except Exception:
            raise
    except Exception as e:
        print(url, e)
    page = BeautifulSoup(self.driver.page_source, "lxml")
    return page
def get_page(self, url):
    try:
        imitate_user(0.2)
        self.logger.info("Getting %s" % url)
        self.driver.get(url)
    except ValueError:
        imitate_user(2)  # back off, then retry once
        try:
            self.driver.get(url)
        except Exception:
            raise
    except Exception as e:
        self.logger.error("%s: %s", url, e)
    page = BeautifulSoup(self.driver.page_source, "lxml")
    return page
def next_page_url(self, url): """ method to build the next url to be fetched. Expects a pagination parameter in the url and builds it iteratively. pc is the control parameter. :param url: :return: """ self.pc += 1 imitate_user(1) next_url = url if self.page_url: next_url += self.page_url # next_url += str(self.pc) if self.pc == 1: # this should be the last param self.run = False # recursion control return next_url
def get_list(self, page):
    # The listing container class varies between result pages; try each
    # known layout in turn.
    if page.find_all("div", {"class": "nme"}):
        entries = page.find_all("div", {"class": "nme"})
    elif page.find_all("div", {"class": "rdetails"}):
        entries = page.find_all("div", {"class": "rdetails"})
    elif page.find_all("div", {"class": "info"}):
        entries = page.find_all("div", {"class": "info"})
    else:
        return  # no recognized listing container on this page
    # entries = page.find_all("div", {"class": "listing"})
    for e in entries:
        if len(e) == 1:
            continue
        elif e.name == "script":
            continue
        else:
            imitate_user(1)
            entry = self.get_detail(e.a.attrs['href'], {})
            if entry:
                self.process_output(entry)
def get_page(self, url):
    try:
        self.logger.info("Getting %s" % url)
        self.driver.get(url)
        # self.driver.get_cookies()
    except ValueError:
        imitate_user(2)  # back off, then retry once
        try:
            self.driver.get(url)
        except Exception:
            raise
    except Exception as e:
        self.logger.error("%s: %s", url, e)
    # try:
    #     wait = WebDriverWait(self.driver, 3)
    #     wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div")))
    # except Exception as e:
    #     self.logger.error("WebDriverWait error")
    page = BeautifulSoup(self.driver.page_source, "lxml")
    return page
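# If the explicit wait commented out above is re-enabled, it needs the
# selenium imports below. A sketch of the same pattern with the imports
# wired up (a standalone helper, not part of the original module):
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_content(driver, timeout=3):
    """Block until at least one div is visible, or give up after timeout."""
    try:
        WebDriverWait(driver, timeout).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, "div"))
        )
    except TimeoutException:
        pass  # proceed with whatever has rendered so far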
def get_list(self, page): """ method takes search results page from Walmart and parses out items to save. Has error checking for no search results or an empty set. :param page: bs4 object returned from get_page :return: """ imitate_user(0.5) if page.find(string=re.compile(r'We found 0 results')): self.run = False return elif not page.find("ul", {"class": "tile-list-grid"}): self.run = False return else: entries = page.find("ul", {"class": "tile-list-grid"}) for e in entries: if len(e) == 1: continue elif e.name == "script": continue else: entry = {} try: entry['title'] = e.find("a", {"class":"js-product-title"}).get_text().strip() except: continue if 'http://' in e.find("a", {"class":"js-product-title"}).attrs['href']: entry['url'] = e.find("a", {"class":"js-product-title"}).attrs['href'] else: entry['url'] = "".join((self.base_url, e.find("a", {"class":"js-product-title"}).attrs['href'])) try: entry['price'] = e.find("span", {"class":"price-display"}).get_text().replace('$', '') except: continue entry['img'] = e.find("img", {"class":"product-image"}).attrs['data-default-image'] entry['item_id'] = e.find("div", {"class": "js-tile", "class": "tile-grid-unit"}).attrs['data-item-id'] #entry['az_price'], entry['weight'], entry['az_sales_rank'], entry['az_match'], entry['az_url'], entry['az_asin'] = self.az.find_best_match(entry['title'], 'Toys') #entry['net'] = self.get_net(entry) #entry['roi'] = self.get_roi(entry) self.process_output(entry)