Example No. 1
 def addUrl(self, url):
     with self._lock:
         if url not in self.visited:
             self.count += 1
             self.visit.append(url)
     Log4Spider.debugLog(self, "add a url[[[", url, "]]]",
                         "current size:[[[", self.count, "]]]")
Example No. 2
 def addUrl(self, url):
     if url not in self.visited:
         self.visit.put(url)
         Log4Spider.infoLog(self, "add a url[[[", url, "]]]",
                            "current size:[[[", self.visit.qsize(), "]]]")
Example No. 3
    def add_handler(self, url_handler):
        """Appends the given url_handler to our handler list.
        """
        print(url_handler)
        url_pattern = url_handler[0]
        if not url_pattern.endswith("$"):
            url_pattern += "$"
        handlers = []
        # the wildcard .*$ should have the lowest priority
        # note: first we only insert an empty handlers list as a placeholder
        if self.handlers and self.handlers[-1][0].pattern == '.*$':
            self.handlers.insert(-1, (re.compile(url_pattern), handlers))
        else:
            self.handlers.append((re.compile(url_pattern), handlers))

        spec = url_handler
        if isinstance(spec, (tuple, list)):  # the url_handler is built from its args
            assert len(spec) in (2, 3, 4)
            spec = URLSpec(*spec)
        handlers.append(spec)
        if spec.name:
            if spec.name in self.named_handlers:
                Log4Spider.warnLog(
                    "Multiple handlers named %s; replacing previous value",
                    spec.name)
            self.named_handlers[spec.name] = spec
Example No. 4
    def add_handler(self, url_handler):
        """Appends the given url_handler to our handler list.
        """
        print(url_handler)
        url_pattern = url_handler[0]
        if not url_pattern.endswith("$"):
            url_pattern += "$"
        handlers = []
        # the wildcard .*$ should have the lowest priority
        # note: first we only insert an empty handlers list as a placeholder
        if self.handlers and self.handlers[-1][0].pattern == '.*$':
            self.handlers.insert(-1, (re.compile(url_pattern), handlers))
        else:
            self.handlers.append((re.compile(url_pattern), handlers))

        spec = url_handler
        if isinstance(spec, (tuple, list)):  # the url_handler is built from its args
            assert len(spec) in (2, 3, 4)
            spec = URLSpec(*spec)
        handlers.append(spec)
        if spec.name:
            if spec.name in self.named_handlers:
                Log4Spider.warnLog(
                    "Multiple handlers named %s; replacing previous value",
                    spec.name)
            self.named_handlers[spec.name] = spec
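Note: the tuples consumed by add_handler have the same shape as the routing table passed to Application() in the later main() examples, so it is presumably called once per (url_pattern, handler) entry. A minimal, hypothetical sketch of direct use (the Application constructor call is an assumption):

app = Application([])  # assumed to start with empty handlers / named_handlers
# 2-tuples of (url_pattern, handler class or dotted path), as in main() below
app.add_handler((r"^http://item\.jd\.com.*",
                 "anger6Spider.spiders.jd_spiders.Jd_Item_Spider"))
# a catch-all entry compiles to '.*$' and is kept at the lowest priority
app.add_handler((r".*", "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"))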
Example No. 5
 def fetch_one_url(url):
     env_obj = SpiderEnv(url)
     env = yield env_obj.gen_env()
     urlSeek = Jd_Home_Spider(env, app)
     yield urlSeek.work()
     for url in urlSeek.urlLists:
         Log4Spider.infoLog(url)
     Log4Spider.infoLog(len(urlSeek.urlLists))
Example No. 6
 def work(self):
     exec = self.app.executor
     try:
         driver = webdriver.PhantomJS(executable_path="/usr/bin/phantomjs")
         yield exec.submit(driver.get, self.env['url'])
         yield self.scrapy(driver)
     except Exception as e:
         Log4Spider.errLog(self, "webdriver.PhantomJS failed: ", e)
Example No. 7
 def work(self):
     exec = self.app.executor
     try:
         driver = webdriver.PhantomJS(executable_path="/usr/bin/phantomjs")
         yield exec.submit(driver.get, self.env['url'])
         yield self.scrapy(driver)
     except Exception as e:
         Log4Spider.errLog(self, "webdriver.PhantomJS failed: ", e)
Example No. 8
 def fetch_one_url(url):
     env_obj = SpiderEnv(url)
     env = yield env_obj.gen_env()
     urlSeek = BaseSpider(env, app)
     yield urlSeek.work()
     for url in urlSeek.urlLists:
         Log4Spider.infoLog(url)
     Log4Spider.infoLog(len(urlSeek.urlLists))
Example No. 9
 def prepare_cul_opts(obj):
     parse = urlparse(self.env['url'])
     path = parse.path
     pic_name = parse.netloc + path.replace("/", "-")
     static_path = self.app.settings["static_path"]
     if not os.path.exists(static_path):
         os.mkdir(static_path)
     pic_path = "%s/%s" % (self.app.settings["static_path"],
                           pic_name)
     Log4Spider.warnLog("PicDown path: ", pic_path)
     obj.setopt(pycurl.WRITEFUNCTION, open(pic_path, "wb").write)
Example No. 10
 def fetch_one_url(url):
     env_obj = SpiderEnv(url)
     env = yield env_obj.gen_env()
     urlSeek = Jd_Item_Spider(env, app)
     yield urlSeek.work()
     for url in urlSeek.urlLists:
         Log4Spider.infoLog(url)
     Log4Spider.infoLog(len(urlSeek.urlLists))
     global num
     num -= 1
     if num == 0:
         event.set()
Example No. 11
 def main():
     for url in [
             "http://www.jianshu.com",
             "http://upload-images.jianshu.io/upload_images/1679702-7e810a34f3ef8d18.jpg?imageMogr2/auto-orient/strip%7CimageView2/1/w/300/h/300"
     ]:
         env_obj = SpiderEnv(url)
         env = yield env_obj.gen_env()
         urlSeek = UrlSeekSpider(env, None)
         yield urlSeek.work()
         for url in urlSeek.urlLists:
             Log4Spider.infoLog(url)
         Log4Spider.infoLog(len(urlSeek.urlLists))
Example No. 12
 def fetch_one_url(url):
     env_obj = SpiderEnv(url)
     env = yield env_obj.gen_env()
     urlSeek = Jd_Item_Spider(env, app)
     yield urlSeek.work()
     for url in urlSeek.urlLists:
         Log4Spider.infoLog(url)
     Log4Spider.infoLog(len(urlSeek.urlLists))
     global num
     num -= 1
     if num == 0:
         event.set()
Example No. 13
 def scrapy(self, driver):
     bs = BeautifulSoup(driver.page_source)
     item = bs.find("div", id="product-intro").find("div", id="itemInfo")
     name = item.find("div", id="name").find("h1").text
     price = item.find("div", id="summary-price").find("strong", class_="p-price").text
     #print(price)
     price = re_price.search(price).group()
     discount = item.find("div", id="summary-price").find("span", class_="p-discount").text
     #print(discount)
     discount = re_price.search(discount).group()
     #print(name,price,discount)
     Log4Spider.dataLog("insert a shop", name, price, discount)
     self.db.shops.insert({"name": name, "price": price, "discount": discount})
     super().scrapy(driver)
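Note: the snippet parses prices with a module-level re_price pattern that is not shown here; a plausible definition, assuming prices appear in the page text as plain decimal numbers such as "1299.00", would be:

import re

# hypothetical: matches an integer or decimal price inside the scraped text
re_price = re.compile(r"\d+(?:\.\d+)?")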
Example No. 14
 def getUrlBySoup(self, soup):
     a_tags = soup.find_all()
     for a_tag in a_tags:
         attrs = a_tag.attrs
         for attr in attrs:
             if attr in (
                     'href', 'src', '#src', '#src2'
             ):  # found a url; urls like javascript:void(null) are not filtered
                 url = url_path = a_tag[attr]
                 if url_path.startswith("//"):
                     url_path = "http:" + url_path
                 if url_path.startswith("http:"):
                     self._url_lists.append(url_path)
                 else:
                     Log4Spider.errLog("Find a unknown url:[[[", url, "]]]")
Example No. 15
    def scrapy(self, driver):
        exec = self.app.executor

        #css_div = 'div[class="%s"] h3'%('item-inner')
        #div_book = yield exec.submit(driver.find_elements_by_css_selector,css_div)
        #for h in div_book:
        #   ActionChains(driver).move_to_element(h).perform()
        pagesource = driver.page_source
        bs = BeautifulSoup(pagesource)
        eles = bs.findAll("div", {"class": "p-name"})
        prices = bs.findAll("div", {"class": "p-price"})
        for price in prices:
            i = price.find("i")
            if i:
                print(i.text)
        for ele, price in zip(eles, prices):
            Log4Spider.dataLog(ele.text, price.text)
        super().scrapy(driver)
Example No. 16
    def scrapy(self, driver):
        exec = self.app.executor

        #css_div = 'div[class="%s"] h3'%('item-inner')
        #div_book = yield exec.submit(driver.find_elements_by_css_selector,css_div)
        #for h in div_book:
        #   ActionChains(driver).move_to_element(h).perform()
        pagesource = driver.page_source
        bs = BeautifulSoup(pagesource)
        eles = bs.findAll("div", {"class": "p-name"})
        prices = bs.findAll("div", {"class": "p-price"})
        for price in prices:
            i = price.find("i")
            if i:
                print(i.text)
        for ele, price in zip(eles, prices):
            Log4Spider.dataLog(ele.text, price.text)
        super().scrapy(driver)
Example No. 17
 def _urlWork(self):
     AsyncHTTPClient.configure(CurlAsyncHTTPClient)
     httpCli = AsyncHTTPClient()
     try:
         response = yield httpCli.fetch(self.env['url'])
     except Exception as e:
         Log4Spider.errLog("urlWork fetch url: ", self.env['url'],
                           "error exception: ", e)
         return
     soup = BeautifulSoup(response.body)
     a_tags = soup.find_all()
     url_parse = self.env['urlparse']  # parsed page url, used to resolve relative links
     for a_tag in a_tags:
         attrs = a_tag.attrs
         for attr in attrs:
             Log4Spider.debugLog("tag: ", a_tag, "attr:", attr)
             if attr in (
                     'href', 'src', '#src', '#src2'
             ):  # found a url; urls like javascript:void(null) are not filtered
                 url = url_path = a_tag[attr]
                 url_path = url_path.replace("//", "/")
                 if url_path.startswith("/"):
                     url = urlunparse([
                         url_parse.scheme, url_parse.netloc, url_path, "",
                         "", ""
                     ])
                 if url.startswith("http"):
                     if not self.parse_url_own or url_parse.netloc in url:
                         self._url_lists.append(url)
                 else:
                     Log4Spider.errLog("Find a unknown url:[[[", url, "]]]")
Example No. 18
 def scrapy(self, driver):
     bs = BeautifulSoup(driver.page_source)
     item = bs.find("div", id="product-intro").find("div", id="itemInfo")
     name = item.find("div", id="name").find("h1").text
     price = item.find("div",
                       id="summary-price").find("strong",
                                                class_="p-price").text
     #print(price)
     price = re_price.search(price).group()
     discount = item.find("div",
                          id="summary-price").find("span",
                                                   class_="p-discount").text
     #print(discount)
     discount = re_price.search(discount).group()
     #print(name,price,discount)
     Log4Spider.dataLog("insert a shop", name, price, discount)
     self.db.shops.insert({
         "name": name,
         "price": price,
         "discount": discount
     })
     super().scrapy(driver)
Example No. 19
    def realWork(self):
        if self.env['mine'][1] in ('jpg', 'jpeg', 'png', 'gif'):
            AsyncHTTPClient.configure(CurlAsyncHTTPClient)

            def prepare_cul_opts(obj):
                parse = urlparse(self.env['url'])
                path = parse.path
                pic_name = parse.netloc + path.replace("/", "-")
                static_path = self.app.settings["static_path"]
                if not os.path.exists(static_path):
                    os.mkdir(static_path)
                pic_path = "%s/%s" % (self.app.settings["static_path"],
                                      pic_name)
                Log4Spider.warnLog("PicDown path: ", pic_path)
                obj.setopt(pycurl.WRITEFUNCTION, open(pic_path, "wb").write)

            httpCli = AsyncHTTPClient()
            try:
                respone = yield httpCli.fetch(
                    self.env['url'], prepare_curl_callback=prepare_cul_opts)
            except Exception as e:
                Log4Spider.errLog("PicDown failed url: ", self.env['url'],
                                  "error exception: ", e)
                return
Example No. 20
    def run(self):
        while True:
            url = yield self.queue.get()
            Log4Spider.debugLog(self, "get url:", url)
            try:
                env = yield SpiderEnv(url).gen_env()
            except Exception as e:
                Log4Spider.errLog(self, "spider env failed url:", url,
                                  "exception:", e)
                continue

            self._find_url_handler(url)
            Log4Spider.infoLog(self, "url: ", url, " --- class: ",
                               self.handler_class)
            spider = self.handler_class(env, self.application,
                                        **self.handler_kwargs)
            yield spider.work()
            for url in spider.urlLists:
                Log4Spider.debugLog(self, "put url:", url)
                yield self.queue.put(url)
Example No. 21
    def run(self):
        while True:
            url = yield self.queue.get()
            Log4Spider.debugLog(self, "get url:", url)
            try:
                env = yield SpiderEnv(url).gen_env()
            except Exception as e:
                Log4Spider.errLog(self, "spider env failed url:", url,
                                  "exception:", e)
                continue

            self._find_url_handler(url)
            Log4Spider.infoLog(self, "url: ", url, " --- class: ",
                               self.handler_class)
            spider = self.handler_class(env, self.application,
                                        **self.handler_kwargs)
            yield spider.work()
            for url in spider.urlLists:
                Log4Spider.debugLog(self, "put url:", url)
                yield self.queue.put(url)
Example No. 22
def main():
    settings = {"host": "localhost", "port": 6379, "db": 0}

    try:
        client = motor.MotorClient("mongodb://localhost:27017/")
        database = client['jd_db']
    except Exception:
        print("mongodb init failed")
        sys.exit(0)

    app_settings = {"static_path": "static", "database": database}

    app = Application([
        (r"^http://www\.jd\.com",
         "anger6Spider.spiders.jd_spiders.Jd_Home_Spider"),
        (r"^http://list\.jd\.com.*",
         "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://channel\.jd\.com.*",
         "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://search\.jd\.com/search\?.*",
         "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://item\.jd\.com.*",
         "anger6Spider.spiders.jd_spiders.Jd_Item_Spider"),
    ], **app_settings)

    concurrency = 20

    from anger6Spider.spiderQueue.redisqueue import RedisQueue
    queue = RedisQueue(**settings)
    queue._create_redis_cli()
    #yield queue.put("http://www.jianshu.com")
    #yield queue.put("http://www.jd.com")
    #yield queue.put("http://www.ivsky.com")
    #yield queue.put("http://www.jd.com")

    workers = []
    for _ in range(concurrency):
        workers.append(Worker(app, queue))

    for worker in workers:
        Log4Spider.debugLog("worker begin:", worker)
        worker.run()

    Log4Spider.debugLog("waiting for spiderQueue empty:")
    yield queue.join(None)
    Log4Spider.debugLog("main done!")
Example No. 23
def main():
    settings = {"host": "localhost", "port": 6379, "db": 0}

    try:
        client = motor.MotorClient("mongodb://localhost:27017/")
        database = client['jd_db']
    except Exception:
        print("mongodb init failed")
        sys.exit(0)

    app_settings = {"static_path": "static", "database": database}

    app = Application([
        (r"^http://www\.jd\.com",
         "anger6Spider.spiders.jd_spiders.Jd_Home_Spider"),
        (r"^http://list\.jd\.com.*",
         "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://channel\.jd\.com.*",
         "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://search\.jd\.com/search?.*",
         "anger6Spider.spiders.jd_spiders.Jd_BaseSpider"),
        (r"^http://item\.jd\.com.*",
         "anger6Spider.spiders.jd_spiders.Jd_Item_Spider"),
    ], **app_settings)

    concurrency = 20

    from anger6Spider.spiderQueue.redisqueue import RedisQueue
    queue = RedisQueue(**settings)
    queue._create_redis_cli()
    #yield queue.put("http://www.jianshu.com")
    #yield queue.put("http://www.jd.com")
    #yield queue.put("http://www.ivsky.com")
    #yield queue.put("http://www.jd.com")

    workers = []
    for _ in range(concurrency):
        workers.append(Worker(app, queue))

    for worker in workers:
        Log4Spider.debugLog("worker begin:", worker)
        worker.run()

    Log4Spider.debugLog("waitiing for spiderQueue empty:")
    yield queue.join(None)
    Log4Spider.debugLog("main done!")
Example No. 24
 def main():
     parse = HtmlFetch(base_url)
     result = yield parse.fetch()
     Log4Spider.infoLog(result)
Example No. 25
 def main():
     parse = HtmlFetch(base_url)
     result = yield parse.fetch()
     Log4Spider.infoLog(result)
Example No. 26
 def addUrl(self, url):
     if url not in self.visited:
         self.visit.put(url)
         Log4Spider.infoLog(self, "add a url[[[", url, "]]]",
                            "current size:[[[", self.visit.qsize(), "]]]")
Example No. 27
 def addUrl(self, url):
     with self._lock:
         if url not in self.visited:
             self.count += 1
             self.visit.append(url)
     Log4Spider.debugLog(self, "add a url[[[", url, "]]]",
                         "current size:[[[", self.count, "]]]")