Example #1
    def robot_task_tacker(self):

        self.logger.info("start task tracker...")

        from captcha.http_client import HTTPClient
        from sailing.conf import settings as CONFIG
        import time

        client = HTTPClient()
        while True:
            try:
                # Poll the remote queue for newly published task lists.
                data = client.get("http://emopad.sinaapp.com/robot/app/task_list")

                self.logger.info("tracker new task:%s" % data)

                if data.strip():
                    # One action per line; an empty response means no work.
                    t = new_task("webrobot")
                    for line in data.split("\n"):
                        t.add_action(line.strip())
                    t.close()
                    t.status = "waiting"
                time.sleep(30)

                # Garbage-collect a batch of finished tasks on each pass.
                task_list = FileTask.search(CONFIG.APP_NAME, "done", len=15)

                for t in task_list:
                    self.logger.info("remove done task:%s" % t._id)
                    # t.remove()

            except Exception:
                # Log transient errors instead of silently swallowing them,
                # and back off before the next attempt.
                self.logger.exception("task tracker iteration failed")
                time.sleep(30)
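
All six examples on this page drive the same small task API: new_task() creates a task, add_action() appends one action line, header() reads or writes a named header, close() seals the task, and assigning status = "waiting" publishes it. The class below is a purely hypothetical in-memory stand-in for that surface, written only to make the call pattern concrete; the real backing type appears to be the file-based FileTask, whose implementation is not shown on this page.

    # Hypothetical in-memory stand-in for the task API exercised above.
    # Every name here is illustrative; the real new_task()/FileTask are
    # file-backed and expose more than this sketch mirrors.
    class InMemoryTask(object):

        def __init__(self, kind):
            self.kind = kind
            self.status = "open"       # callers flip this to "waiting"
            self._headers = {}
            self._actions = []
            self._closed = False

        def header(self, name, value=None):
            # Two-argument form sets a header; one-argument form reads it
            # back, matching both uses seen in the examples.
            if value is None:
                return self._headers.get(name)
            self._headers[name] = value

        def add_action(self, line):
            # One action per call; Example #1 feeds it stripped lines.
            self._actions.append(line)

        def list_actions(self):
            return list(self._actions)

        def close(self):
            self._closed = True


    def new_task(kind):
        return InMemoryTask(kind)


    # Usage mirroring Example #1:
    t = new_task("webrobot")
    t.add_action("http://example.com/ ==> index.html")
    t.close()
    t.status = "waiting"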
Example #2
    def robot_task_tacker(self):

        self.logger.info("start task tracker...")

        from captcha.http_client import HTTPClient
        from sailing.conf import settings as CONFIG
        import time

        client = HTTPClient()
        while True:
            try:
                # Poll the remote queue for newly published task lists.
                data = client.get(
                    "http://emopad.sinaapp.com/robot/app/task_list")

                self.logger.info("tracker new task:%s" % data)

                if data.strip():
                    # One action per line; an empty response means no work.
                    t = new_task("webrobot")
                    for line in data.split("\n"):
                        t.add_action(line.strip())
                    t.close()
                    t.status = 'waiting'
                time.sleep(30)

                # Garbage-collect a batch of finished tasks on each pass.
                task_list = FileTask.search(CONFIG.APP_NAME, "done", len=15)

                for t in task_list:
                    self.logger.info("remove done task:%s" % t._id)
                    # t.remove()

            except Exception:
                # Log transient errors instead of silently swallowing them,
                # and back off before the next attempt.
                self.logger.exception("task tracker iteration failed")
                time.sleep(30)
Example #3
    def _crawling(self, site, crawlers, path, url, status):

        self.logger.info("Crawling path:%s" % site.real_path(path))
        task = new_task('spider')

        task.header('Site', site.hostname)

        # Absolute URLs are reduced to their path component before crawling
        # (the original only checked "http:"; "https:" is handled here too).
        if url.startswith(("http:", "https:")):
            url = urlparse(url).path

        for c in crawlers:
            crawler = self.crawlers[c]
            crawler.crawl(status, path, url, next_task=task, site=site)

        task.status = 'waiting'
        task.remove_empty()
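
For reference, the normalization above relies on urlparse reducing an absolute URL to its path component, so crawlers always receive site-relative paths. A quick sanity check (Python 2 import spelling; on Python 3 the function lives in urllib.parse):

    from urlparse import urlparse  # Python 3: from urllib.parse import urlparse

    assert urlparse("http://example.com/docs/index.html").path == "/docs/index.html"
    assert urlparse("https://example.com:8080/a/b/").path == "/a/b/"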
Example #4
    def start(self, t):

        site = WebSite(t.header('Site'), "", "worker")

        # Actions produced while handling this task are queued on a follow-up task.
        next_task = new_task('worker')
        next_task.header('Site', t.header('Site'))

        for line in t.list_actions():
            action, url, save_as, args = self._parse_action(line, site)
            try:
                handler = self.actions.get(action, None)
                self.logger.debug("[%s] %s --> %s, args:%s" % (action, url, save_as, str(args)))
                if handler is not None:
                    handler(site, self.http, next_task, url, save_as, *args)
                else:
                    self.logger.error("spider action not found: '%s'", action)

            except Exception as e:
                self.logger.exception(trackable(u"Exception on task '%s'" % e))
Example #5
    def idle(self):

        task = new_task('spider')

        url = urlparse(settings.START_INDEX)
        host_name = url.hostname
        if url.port:
            host_name += ":%s" % url.port

        task.header('Site', host_name)

        # A trailing slash (or an empty path) implies the directory index page.
        url_path = url.path
        if url_path.endswith("/") or not url_path.strip():
            url_path = "%sindex.html" % url_path

        task.add_action("%s ==> %s" % (settings.START_INDEX, url_path.strip("/")))

        task.status = 'waiting'

        self.logger.info("created new task '%s'." % task.path)
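
The action string follows a "source ==> destination" convention. Assuming a hypothetical settings.START_INDEX of "http://example.com:8080/", url.path is "/", so the path becomes "/index.html", the leading slash is stripped, and the recorded action line is:

    http://example.com:8080/ ==> index.html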
Example #6
    def start(self, t):

        site = WebSite(t.header('Site'), "", "worker")

        # Actions produced while handling this task are queued on a follow-up task.
        next_task = new_task('worker')
        next_task.header('Site', t.header('Site'))

        for line in t.list_actions():
            action, url, save_as, args = self._parse_action(line, site)
            try:
                handler = self.actions.get(action, None)
                self.logger.debug("[%s] %s --> %s, args:%s" %
                                  (action, url, save_as, str(args)))
                if handler is not None:
                    handler(site, self.http, next_task, url, save_as, *args)
                else:
                    self.logger.error("spider action not found: '%s'", action)

            except Exception as e:
                self.logger.exception(trackable(u"Exception on task '%s'" % e))
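
_parse_action itself is not shown on this page. Given the "url ==> save_as" lines produced in Example #5, a minimal sketch might split on "==>" and treat comma-separated items after the destination as extra handler arguments; the action verb is defaulted below because the examples never show one. Every detail of this parser is an assumption about the format, not the project's actual implementation:

    # Hypothetical parser for action lines of the form built in Example #5:
    #   "http://example.com:8080/ ==> index.html"
    # The real _parse_action is a method and also receives the site; the
    # default "fetch" verb is invented for this sketch.
    def _parse_action(line, site=None):
        url, _, dest = line.partition("==>")
        parts = [p.strip() for p in dest.split(",")]
        save_as, args = parts[0], parts[1:]
        return "fetch", url.strip(), save_as, args


    assert _parse_action("http://example.com:8080/ ==> index.html") == \
        ("fetch", "http://example.com:8080/", "index.html", [])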