Exemplo n.º 1
0
 def _urlWork(self):
     """Fetch the page at ``self.env['url']`` and collect the links it contains.

     Generator-style coroutine: it yields the pending HTTP fetch and resumes
     with the response (presumably driven by a Tornado coroutine decorator
     that is outside this view -- confirm).

     Every tag in the parsed document is scanned for ``href``, ``src``,
     ``#src`` and ``#src2`` attributes.  Root-relative values are resolved
     against the scheme and host stored in ``self.env['urlparse']``.
     Resulting ``http...`` URLs are appended to ``self._url_lists``; when
     ``self.parse_url_own`` is truthy only URLs containing the page's own
     netloc are kept.  Everything else is reported through the error log.

     A failed fetch is logged and ends the coroutine without collecting
     anything.
     """
     AsyncHTTPClient.configure(CurlAsyncHTTPClient)
     httpCli = AsyncHTTPClient()
     try:
         respone = yield httpCli.fetch(self.env['url'])
     except Exception as e:
         Log4Spider.errLog("urlWork fetch url: ", self.env['url'],
                           "error exception: ", e)
         return
     # Bound once, before the loops: the own-host filter below needs it for
     # every absolute URL, not only for the root-relative ones.  Binding it
     # lazily raised UnboundLocalError on the first absolute link.
     url_parse = self.env['urlparse']
     soup = BeautifulSoup(respone.body)
     for a_tag in soup.find_all():
         for attr in a_tag.attrs:
             Log4Spider.debugLog("tag: ", a_tag, "attr:", attr)
             if attr not in ('href', 'src', '#src', '#src2'):
                 continue
             # Found a URL; values such as javascript:void(null) are not
             # filtered out here and end up in the error log below.
             url = a_tag[attr]
             if url.startswith("//"):
                 # Scheme-relative reference: it names another host, so it
                 # only needs the current page's scheme.  Collapsing "//"
                 # would wrongly turn the host into a path on our own host.
                 url = url_parse.scheme + ":" + url
             else:
                 # Collapse doubled slashes, then resolve root-relative
                 # paths against the current page's scheme and host.
                 url_path = url.replace("//", "/")
                 if url_path.startswith("/"):
                     url = urlunparse([
                         url_parse.scheme, url_parse.netloc, url_path, "",
                         "", ""
                     ])
             if url.startswith("http"):
                 if not self.parse_url_own or url_parse.netloc in url:
                     self._url_lists.append(url)
             else:
                 Log4Spider.errLog("Find a unknown url:[[[", url, "]]]")
Exemplo n.º 2
0
 def work(self):
     """Load ``self.env['url']`` in a PhantomJS browser and scrape it.

     Generator-style coroutine: the blocking ``driver.get`` call is handed to
     ``self.app.executor`` and awaited, then ``self.scrapy(driver)`` is
     awaited.  Any exception raised along the way is logged, not propagated.

     The driver is always quit afterwards so that the PhantomJS process
     started for this URL does not outlive the call.
     """
     # Named "executor" rather than "exec" to avoid shadowing the builtin.
     executor = self.app.executor
     driver = None
     try:
         driver = webdriver.PhantomJS(executable_path="/usr/bin/phantomjs")
         yield executor.submit(driver.get, self.env['url'])
         yield self.scrapy(driver)
     except Exception as e:
         Log4Spider.errLog(self,"webdriver.PhantomJS failed: ",e)
     finally:
         # driver stays None when the constructor itself failed.
         if driver is not None:
             try:
                 driver.quit()
             except Exception as e:
                 # Cleanup is best effort; a failed quit must not mask the
                 # outcome of the scrape.
                 Log4Spider.errLog(self, "webdriver.PhantomJS quit failed: ", e)
Exemplo n.º 3
0
 def work(self):
     """Render ``self.env['url']`` with PhantomJS, then run the scraper on it.

     Generator-style coroutine.  ``driver.get`` blocks, so it is submitted to
     ``self.app.executor`` and its future is yielded; ``self.scrapy(driver)``
     is yielded next.  Exceptions from any step are logged and swallowed.

     The browser is shut down in ``finally`` so no PhantomJS process is left
     behind, whether the scrape succeeded or failed.
     """
     # "exec" is a builtin; use a name that does not shadow it.
     executor = self.app.executor
     driver = None
     try:
         driver = webdriver.PhantomJS(executable_path="/usr/bin/phantomjs")
         yield executor.submit(driver.get, self.env['url'])
         yield self.scrapy(driver)
     except Exception as e:
         Log4Spider.errLog(self, "webdriver.PhantomJS failed: ", e)
     finally:
         # Nothing to release when webdriver.PhantomJS() itself raised.
         if driver is not None:
             try:
                 driver.quit()
             except Exception as e:
                 # Best-effort cleanup: report, never raise from here.
                 Log4Spider.errLog(self, "webdriver.PhantomJS quit failed: ",
                                   e)
Exemplo n.º 4
0
 def getUrlBySoup(self, soup):
     """Collect absolute links from a parsed document into ``self._url_lists``.

     Args:
         soup: parsed document; only ``find_all()`` is called on it, and each
             returned tag is read through ``.attrs`` and ``tag[attr]``.

     The ``href``, ``src``, ``#src`` and ``#src2`` attributes of every tag
     are inspected.  Scheme-relative values (``//host/path``) are given an
     ``http:`` scheme.  Values that are then ``http:`` or ``https:`` URLs are
     appended to ``self._url_lists``; anything else is written to the error
     log.
     """
     for a_tag in soup.find_all():
         for attr in a_tag.attrs:
             if attr not in ('href', 'src', '#src', '#src2'):
                 continue
             # Found a URL; values such as javascript:void(null) are not
             # filtered out here and end up in the error log below.
             url = a_tag[attr]
             if url.startswith("//"):
                 url = "http:" + url
             # Accept https as well as http: matching only "http:" dropped
             # every secure link and reported it as unknown.
             if url.startswith(("http:", "https:")):
                 self._url_lists.append(url)
             else:
                 Log4Spider.errLog("Find a unknown url:[[[", url, "]]]")
Exemplo n.º 5
0
    def run(self):
        """Worker loop: take URLs from the queue, crawl them, queue new links.

        Generator-style coroutine that never returns on its own.  For each
        URL taken from ``self.queue`` it builds a ``SpiderEnv``, resolves the
        handler via ``self._find_url_handler``, instantiates
        ``self.handler_class`` and awaits its ``work()``.  Every URL in the
        handler's ``urlLists`` is then put back on the queue.

        A URL whose environment cannot be built is logged and skipped.
        """
        while True:
            current = yield self.queue.get()
            Log4Spider.debugLog(self, "get url:", current)
            try:
                env = yield SpiderEnv(current).gen_env()
            except Exception as err:
                Log4Spider.errLog(self, "spider env failed url:", current,
                                  "exception:", err)
                continue

            # Sets self.handler_class / self.handler_kwargs for this URL
            # (presumably -- both are read right after the call).
            self._find_url_handler(current)
            Log4Spider.infoLog(self, "url: ", current, " --- class: ",
                               self.handler_class)
            handler = self.handler_class(env, self.application,
                                         **self.handler_kwargs)
            yield handler.work()
            for found in handler.urlLists:
                Log4Spider.debugLog(self, "put url:", found)
                yield self.queue.put(found)
Exemplo n.º 6
0
    def run(self):
        """Endlessly process queued URLs and feed discovered links back in.

        Generator-style coroutine.  Each iteration:

        1. awaits the next URL from ``self.queue``;
        2. awaits ``SpiderEnv(url).gen_env()``; on failure the error is
           logged and the URL is dropped;
        3. calls ``self._find_url_handler(url)`` and builds a
           ``self.handler_class`` instance with the environment,
           ``self.application`` and ``self.handler_kwargs``;
        4. awaits the instance's ``work()`` and re-queues every entry of its
           ``urlLists``.
        """
        while True:
            page_url = yield self.queue.get()
            Log4Spider.debugLog(self, "get url:", page_url)

            try:
                page_env = yield SpiderEnv(page_url).gen_env()
            except Exception as exc:
                Log4Spider.errLog(
                    self, "spider env failed url:", page_url,
                    "exception:", exc)
                continue

            self._find_url_handler(page_url)
            Log4Spider.infoLog(
                self, "url: ", page_url, " --- class: ", self.handler_class)

            worker = self.handler_class(
                page_env, self.application, **self.handler_kwargs)
            yield worker.work()

            for link in worker.urlLists:
                Log4Spider.debugLog(self, "put url:", link)
                yield self.queue.put(link)
Exemplo n.º 7
0
    def realWork(self):
        """Download the image at ``self.env['url']`` into the static directory.

        Generator-style coroutine.  Nothing happens unless
        ``self.env['mine'][1]`` is one of ``jpg``, ``jpeg``, ``png`` or
        ``gif`` (presumably the subtype of the resource's MIME type --
        confirm against the code that fills ``env``).

        The response body is streamed by libcurl straight into a file named
        ``<netloc><path with "/" replaced by "-">`` under
        ``self.app.settings["static_path"]``; the directory is created when
        missing.  A failed fetch is logged and ends the coroutine.
        """
        if self.env['mine'][1] in ('jpg', 'jpeg', 'png', 'gif'):
            AsyncHTTPClient.configure(CurlAsyncHTTPClient)
            # Files opened by the curl callback, kept so they can be closed
            # once the transfer is over instead of leaking the handle.
            opened_files = []

            def prepare_cul_opts(obj):
                # Runs with the pycurl handle before the request starts and
                # points its write callback at the destination file.
                parse = urlparse(self.env['url'])
                path = parse.path
                pic_name = parse.netloc + path.replace("/", "-")
                static_path = self.app.settings["static_path"]
                if not os.path.exists(static_path):
                    os.mkdir(static_path)
                pic_path = "%s/%s" % (self.app.settings["static_path"],
                                      pic_name)
                Log4Spider.warnLog("PicDown path: ", pic_path)
                pic_file = open(pic_path, "wb")
                opened_files.append(pic_file)
                obj.setopt(pycurl.WRITEFUNCTION, pic_file.write)

            httpCli = AsyncHTTPClient()
            try:
                respone = yield httpCli.fetch(
                    self.env['url'], prepare_curl_callback=prepare_cul_opts)
            except Exception as e:
                Log4Spider.errLog("PicDown failed url: ", self.env['url'],
                                  "error exception: ", e)
                return
            finally:
                # The transfer is finished (or failed) here, so the data can
                # be flushed and the descriptor released.
                for pic_file in opened_files:
                    pic_file.close()