Пример #1
0
    def __init__(self, driver=None, base_url=None):
        """ Store the driver and the URL base on the new instance.

        When `driver` is None, ``dryscrape.driver.webkit.Driver`` is imported
        and instantiated to serve as the driver.

        Raises ValueError if that fallback hits an ImportError. """
        if driver is None:
            # Fall back to the webkit driver; the import is done lazily so
            # the dependency is only needed when no driver is supplied.
            try:
                from dryscrape.driver.webkit import Driver as fallback_driver
                driver = fallback_driver()
            except ImportError:
                raise ValueError('No driver instance can be created.')

        self.driver = driver
        self.base_url = base_url
def _extract_full_image_urls(soup):
    """Return the full-size image URLs derived from the thumbnail divs."""
    image_urls = []
    thumbnail_divs = soup.findAll(
        "div", {"class": "product__image-thumbnails--item"})
    for thumbnail_div in thumbnail_divs:
        image_tags = thumbnail_div.findAll("img")
        if not image_tags:
            continue
        src = image_tags[0].get("src")
        if not src:
            # An <img> with a missing or empty src has nothing to rewrite.
            continue
        # Only the first "thumbnail" occurrence is swapped for "full".
        image_urls.append(src.replace("thumbnail", "full", 1))
    return image_urls


def _extract_category_labels(soup):
    """Return the UTF-8 encoded breadcrumb labels in document order."""
    labels = []
    for breadcrumb in soup.findAll("div", {"class": "breadcrumb__block"}):
        anchors = breadcrumb.findAll("a")
        spans = anchors[0].findAll("span") if anchors else []
        if not spans:
            # Skip breadcrumb blocks without an <a><span> label, mirroring
            # how thumbnail divs without an <img> are skipped.
            continue
        # NOTE(review): encode() yields bytes on Python 3, where the later
        # "->".join() would raise TypeError; kept as-is because the target
        # Python version cannot be determined from here -- confirm.
        labels.append(spans[0].text.encode("utf-8"))
    return labels


def get_prdouct_category_and_image(product_urls):
    """Crawl product pages and collect their image URLs and category path.

    For every URL in `product_urls` the page is rendered through a fresh
    dryscrape session, parsed with BeautifulSoup, and reduced to a dict with
    two keys:

    * ``"image_urls"`` -- comma-joined image URLs taken from the thumbnail
      divs, with the first "thumbnail" in each URL replaced by "full".
    * ``"cat_label"`` -- breadcrumb labels joined with "->".

    Each dict is passed to ``write_to_csv`` under the fixed name
    "image_data_set700_800" as soon as it is built, and is also appended to
    the returned list.

    Returns the list of per-URL dicts, in input order.
    """
    img_cat_data = []
    for count, url in enumerate(product_urls):
        print("crawling in progress -> {} ".format(count))
        print(url)

        session = dryscrape.Session(driver=Driver())
        # The timeout has to be configured before the page load it is meant
        # to bound; setting it after visit() had no effect on that load.
        session.set_timeout(30)
        session.visit(url)
        response = session.body()
        session.reset()

        soup = BeautifulSoup(response, 'lxml')
        print("data fetched")

        image_category = {
            "image_urls": ",".join(_extract_full_image_urls(soup)),
            "cat_label": "->".join(_extract_category_labels(soup)),
        }
        img_cat_data.append(image_category)

        # Persist each record immediately so partial crawls keep their data.
        write_to_csv("image_data_set700_800", image_category)
    return img_cat_data
Пример #3
0
 def __init__(self, driver=None, base_url=None):
   """ Remember the driver and URL base for this instance.

   Without an explicit `driver`, an instance of
   ``dryscrape.driver.webkit.Driver`` is created and used.

   Raises ValueError when creating that default hits an ImportError. """
   if driver is None:
     # Lazy import: the webkit driver is only needed for the default case.
     try:
       from dryscrape.driver.webkit import Driver as webkit_driver
       driver = webkit_driver()
     except ImportError:
       raise ValueError('No driver instance can be created.')

   self.driver = driver
   self.base_url = base_url
Пример #4
0
class Session(object):
  """ A web scraping session based on a driver instance. Implements the proxy
  pattern to pass unresolved attribute lookups to the underlying driver.

  If no `driver` is specified, the session creates an instance of
  ``dryscrape.driver.webkit.Driver`` and uses that.

  If `base_url` is set, URLs are completed against it with ``urljoin``.
  If not, URLs are passed on unchanged. """

  def __init__(self,
               driver = None,
               base_url = None):
    """ Store `driver` (or a freshly created default driver) and `base_url`.

    Raises ValueError if no driver is given and creating the default one
    fails with an ImportError. """
    if driver is not None:
      self.driver = driver
    else:
      try:
        from dryscrape.driver.webkit import Driver as DefaultDriver
        self.driver = DefaultDriver()
      except ImportError:
        raise ValueError('No driver instance can be created.')
    self.base_url = base_url

  # implement proxy pattern
  def __getattr__(self, attr):
    """ Pass unresolved attribute lookups to the underlying driver.

    Raises AttributeError if the driver itself is not set yet. """
    # __getattr__ only runs after normal lookup failed, so being asked for
    # 'driver' means the instance has none (e.g. while being copied or
    # unpickled). Reading self.driver here would recurse without bound.
    if attr == 'driver':
      raise AttributeError(attr)
    return getattr(self.driver, attr)

  def visit(self, url):
    """ Passes through the URL to the driver after completing it using the
    instance's URL base. Returns whatever the driver's ``visit`` returns. """
    return self.driver.visit(self.complete_url(url))

  def complete_url(self, url):
    """ Completes a given URL with this instance's URL base. The URL is
    returned unchanged when no URL base is set. """
    if self.base_url:
      return urlparse.urljoin(self.base_url, url)
    else:
      return url

  def interact(self, **local):
    """ Drops the user into an interactive Python session with the ``sess`` variable
    set to the current session instance. If keyword arguments are supplied, these
    names will also be available within the session. """
    import code
    code.interact(local=dict(sess=self, **local))
Пример #5
0
class Session(object):
    """ A web scraping session based on a driver instance. Implements the
    proxy pattern to pass unresolved attribute lookups to the underlying
    driver.

    If no `driver` is specified, the session creates an instance of
    ``dryscrape.driver.webkit.Driver`` and uses that.

    If `base_url` is set, URLs are completed against it with ``urljoin``.
    If not, URLs are passed on unchanged. """
    def __init__(self, driver=None, base_url=None):
        """ Store `driver` (or a freshly created default driver) and
        `base_url`.

        Raises ValueError if no driver is given and creating the default one
        fails with an ImportError. """
        if driver is not None:
            self.driver = driver
        else:
            try:
                from dryscrape.driver.webkit import Driver as DefaultDriver
                self.driver = DefaultDriver()
            except ImportError:
                raise ValueError('No driver instance can be created.')
        self.base_url = base_url

    # implement proxy pattern
    def __getattr__(self, attr):
        """ Pass unresolved attribute lookups to the underlying driver.

        Raises AttributeError if the driver itself is not set yet. """
        # This hook only fires once regular lookup has failed, so a request
        # for 'driver' means it was never assigned (e.g. the instance is
        # being copied or unpickled). Touching self.driver at that point
        # would re-enter this method endlessly.
        if attr == 'driver':
            raise AttributeError(attr)
        return getattr(self.driver, attr)

    def visit(self, url):
        """ Passes through the URL to the driver after completing it using
        the instance's URL base. Returns whatever the driver's ``visit``
        returns. """
        return self.driver.visit(self.complete_url(url))

    def complete_url(self, url):
        """ Completes a given URL with this instance's URL base. The URL is
        returned unchanged when no URL base is set. """
        if self.base_url:
            return urlparse.urljoin(self.base_url, url)
        else:
            return url

    def interact(self, **local):
        """ Drops the user into an interactive Python session with the
        ``sess`` variable set to the current session instance. If keyword
        arguments are supplied, these names will also be available within
        the session. """
        import code
        code.interact(local=dict(sess=self, **local))