Example #1
 def __init__(self):
     '''
     Initialize the current depth; normally it starts from 0.
     '''
     logger.info("init process~")
     print u"=== 当前进度为:"
     self.deep = 0
     self._pbar = click.progressbar(length=1, label="deep 0 : ")
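The bar here is created outside the usual with-statement and driven by hand, which is how the later examples call finish() and render_progress() on it directly. A minimal sketch of that manual-driving pattern, assuming an older click release that allows using the ProgressBar object outside a with block:

    import click

    bar = click.progressbar(length=10, label="deep 0 : ")
    bar.render_progress()        # draw the bar in its initial state
    for _ in range(10):
        bar.update(1)            # advance one step and redraw
    bar.finish()                 # mark the bar complete
    bar.render_finish()          # emit the trailing newline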
Example #2
 def _finish_pbar(self):
     '''
     Finish the previous progress bar.
     :return:
     '''
     logger.info("deep %s over!" % (self.deep - 1))
     self._pbar.finish()
     self._pbar.render_progress()
     self._pbar.render_finish()
Example #3
   def run(self):
       """
        Take jobs (URLs to query) from the queue, parse each page, and save the results.
       """
       global linksVisited
       global websiteIndex
       global logger

       dataLock = self.dataLock

       while True:

           # Get the job(link) from the queue and parse link
           queueItem = self.queue.get()
           currentLink = urlparse(queueItem)

           # Make sure link wasn't already visited and add it to the list of visited links
           with dataLock:
               if currentLink.path in linksVisited:
                   self.queue.task_done()
                   continue
               linksVisited.add(currentLink.path)

           logger.info("Thread with id " + str(self.id) + " starts crawling " + currentLink.path)

           # Query page - Add some headers so that websites such as Monzo.com aren't afraid and answer ;)
           try:
                req = urllib.request.Request(urllib.parse.urljoin(self.domain, currentLink.path), headers={'User-Agent': 'Mozilla/5.0'})
                webPage = urllib.request.urlopen(req)
            except urllib.error.HTTPError as e:
                # Whoops it wasn't a 200
                logger.error("Error - Thread with id " + str(self.id) + " while crawling " + urllib.parse.urljoin(self.domain, currentLink.path) + ": " + str(e))
                self.queue.task_done()
                continue

           # Create instance of HTML parser
           try:
               htmlParser = Parser(self.domain)
                # decode the response bytes; str() on bytes would feed the b'...' repr
                htmlParser.feed(webPage.read().decode("utf-8", errors="ignore"))
               htmlParser.close()
           except UnboundLocalError:
                logger.error("Error - Thread with id " + str(self.id) + " while parsing " + urllib.parse.urljoin(self.domain, currentLink.path))
                self.queue.task_done()
                continue

            # Find remaining links to visit (again synchronised so that links aren't handled twice)
           with self.dataLock:
               # Save links and assets as a tuple
                websiteIndex[currentLink.path] = (htmlParser.links, htmlParser.staticAssets)
               linksNotVisitedYet = htmlParser.links.difference(linksVisited)

           # Add links to visit
           for link in linksNotVisitedYet:
               self.queue.put(link)

           self.queue.task_done()
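run() above is a method of a worker Thread subclass that is not shown. A hypothetical wiring of such workers, with the class name CrawlerThread and its constructor arguments assumed from the attributes run() uses (self.id, self.queue, self.domain, self.dataLock):

    import queue
    import threading

    linksVisited = set()
    websiteIndex = {}

    jobQueue = queue.Queue()
    jobQueue.put("https://monzo.com/")    # seed URL

    dataLock = threading.Lock()

    # CrawlerThread is an assumption: a threading.Thread subclass storing
    # id, queue, domain and dataLock, with the run() method shown above.
    workers = [CrawlerThread(i, jobQueue, "https://monzo.com/", dataLock)
               for i in range(4)]
    for w in workers:
        w.daemon = True    # let the process exit once crawling is done
        w.start()

    jobQueue.join()        # returns once every put() has a matching task_done()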
Example #4
 def build_thread_pool(self, f):
     '''
     Build the thread pool.
     :param f: worker function (not generic; it only supports a single argument)
     :return: None; the threads are appended to self.pool
     '''
     logger.info("build thread pool, and the number is " + str(self.num))
     for _ in xrange(self.num):
         thread = SiThread(self._queue, self._result, f)
         thread.start()
         self.pool.append(thread)
Example #5
    def _del(self):
        global stop

        logger.info("destory thread pool")
        for x in self.pool:
            self._queue.put('stop')
        for t in self.pool:
            t.join()
        self._dbqueue.put('stop')
        self.dbt.join()
        logger.debug("destory thread pool suc")
Example #6
    def run(self):
        global stop

        while True:
            r = self._queue.get()
            if r == 'stop':
                logger.info("database thread be stoped")
                break
            if r['type'] == 'html':
                operate['db'].insert(r['html'], r['url'])
            else:
                logger.warn("not a html page")
Example #7
    def run(self):
        global stop

        logger.debug("run a thread")
        while True:
            url = self._queue.get()
            if url == 'stop':
                logger.info(
                    str(threading.currentThread().ident) + " stopped")
                break
            r = self._f(url)
            self._result.put(r)
Example #8
 def _run(self, length, queue):
     '''
     Start the progress bar for a new depth.
     :param length: number of URLs to crawl at the current depth
     :return: None
     '''
     self.deep += 1
     logger.info("begin run deep %s process bar, the length is %s" % (self.deep, length))
     label = "deep %s: " %(self.deep)
     self._queue = queue
     self.length = length
     # finish the previous progress bar
     self._finish_pbar()
     # create a new progress bar
     self._pbar = click.progressbar(length=length, label=label, show_percent=False, show_pos=True)
     # show progress every 10 seconds
     self.t = threading.Thread(target=self.timer)
     self.t.start()
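The timer method started here is not shown in these examples. A hypothetical body consistent with how _run() sets self._queue, self.length and self._pbar (setting the bar's pos attribute and the queue-size arithmetic are assumptions):

    import time

    def timer(self):
        while True:
            done = self.length - self._queue.qsize()    # URLs already taken off the queue
            self._pbar.pos = min(done, self.length)     # assumption: drive the position directly
            self._pbar.render_progress()
            if done >= self.length:
                break
            time.sleep(10)    # refresh every 10 seconds, per the comment above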
Example #9
 def __init__(self, num, func):
     '''
     self._queue   queue of task arguments (URLs)
     self._result  queue of returned results
     self.pool     list of worker threads
     :param num: number of threads
     :param func: worker function
     '''
     logger.info("init a thread pool class")
     self._queue = Queue.Queue()
     self._result = Queue.Queue()
     self._dbqueue = Queue.Queue()
     self.pool = []
     self.num = num
     # build the worker thread pool
     self.build_thread_pool(func)
     # thread that inserts results into the database
     self.dbt = DbThread(self._dbqueue)
     self.dbt.start()
     # progress display
     self.process_bar = ShowProcess()
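A hypothetical end-to-end use of this pool, assuming the class above is named ThreadPool (Python 2, matching its Queue/xrange style); how results travel from _result to _dbqueue is not shown in these examples:

    import urllib2

    def fetch(url):
        # single-argument worker function, as build_thread_pool requires
        return {'type': 'html', 'html': urllib2.urlopen(url).read(), 'url': url}

    tp = ThreadPool(4, fetch)               # hypothetical name for the class above
    tp._queue.put('http://example.com/')    # hand the workers a URL
    result = tp._result.get()               # dict produced by fetch()
    tp._del()                               # poison-pill shutdown shown earlier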
Example #10
 def __init__(self, queue1, queue2, f):
     threading.Thread.__init__(self)
     self._queue = queue1
     self._result = queue2
     self._f = f
     logger.info('init a thread')