Пример #1
0
    def scheduleUrl(self):
        """
        Schedule the URL-level rules (run_type 1) against this task's URLs.

        The rule script must define a run_url function (looked up under the
        name held in RUN_URL_DEFAULT_FUN). Rules whose id is already listed in
        ``self.finished_progress`` are skipped. For every remaining rule whose
        script provides that function, one ``self.runUrl`` job per stored URL
        is spawned on ``self.pool``.
        """
        DEBUG("scheduleUrl start")
        sql = "SELECT `rule_id`,`risk`,`file_name` FROM `%s` WHERE `run_type` = 1 ORDER BY  `priority`" % RULE_TABLE
        rules = [(str(rule.rule_id), rule.file_name, rule.risk) for rule in db.iter(sql) if str(rule.rule_id) not in self.finished_progress]

        if not conf.spider_finish:  # spider not finished, start crawler
            CrawlEngine.start()

        # Only the table name is interpolated into the statement text; the
        # task id is handed to the driver as a bound parameter (the escaped
        # ``%%s`` survives as the ``%s`` placeholder) instead of being pasted
        # into the SQL string.
        sql = "SELECT `url`,`method`,`params`,`referer` FROM %s WHERE `task_id`=%%s" % (URL_TABLE,)
        reqs = [Url(row.url, row.method, row.params, row.referer) for row in db.iter(sql, self.task_id)]

        for rule_id, filename, risk in rules:
            run_url = attr_from_script(filename, RUN_URL_DEFAULT_FUN)
            if not run_url:
                # Script does not expose the expected entry point; skip it.
                continue
            DEBUG("rule_id:%s filename:%s run_url start" % (rule_id, filename))
            for req in reqs:
                self.pool.spawn(self.runUrl, rule_id, run_url, req, filename, risk)
                # Yield so other greenlets get a chance to run.
                gevent.sleep(0)
            # Logged once the jobs are spawned; nothing here waits for them.
            DEBUG("rule_id:%s filename:%s run_url end" % (rule_id, filename))
        DEBUG("scheduleUrl end")
Пример #2
0
 def get_exist_url(self):
     """
     Return the rows of the ``url`` table that belong to this task.

     The rows yielded by ``db.iter`` are collected into a list. If the
     query raises, the failure is logged through ERROR and an empty list
     is returned instead.
     """
     query = "SELECT * FROM url WHERE task_id=%s"
     try:
         rows = list(db.iter(query, self.task_id))
     except Exception:
         ERROR('get_url exception,task_id:%s' % self.task_id)
         rows = None
     return rows if rows else []
Пример #3
0
 def get_exist_url(self):
     """
     Fetch every ``url`` row recorded for ``self.task_id``.

     Returns a list of the rows produced by ``db.iter``; when the lookup
     raises, the error is reported via ERROR and the result is ``[]``.
     """
     statement = "SELECT * FROM url WHERE task_id=%s"
     try:
         return list(db.iter(statement, self.task_id))
     except Exception:
         ERROR('get_url exception,task_id:%s' % self.task_id)
         return []
Пример #4
0
    def scheduleUrl(self):
        """
        Schedule the URL-level rules (run_type 1) against this task's URLs.

        The rule script must define a run_url function (looked up under the
        name held in RUN_URL_DEFAULT_FUN). Rules whose id is already listed in
        ``self.finished_progress`` are skipped. For every remaining rule whose
        script provides that function, one ``self.runUrl`` job per stored URL
        is spawned on ``self.pool``.
        """
        DEBUG("scheduleUrl start")
        sql = "SELECT `rule_id`,`risk`,`file_name` FROM `%s` WHERE `run_type` = 1 ORDER BY  `priority`" % RULE_TABLE
        rules = [(str(rule.rule_id), rule.file_name, rule.risk)
                 for rule in db.iter(sql)
                 if str(rule.rule_id) not in self.finished_progress]

        if not conf.spider_finish:  # spider not finished, start crawler
            CrawlEngine.start()

        # Only the table name is interpolated into the statement text; the
        # task id is handed to the driver as a bound parameter (the escaped
        # ``%%s`` survives as the ``%s`` placeholder) instead of being pasted
        # into the SQL string.
        sql = "SELECT `url`,`method`,`params`,`referer` FROM %s WHERE `task_id`=%%s" % (
            URL_TABLE, )
        reqs = [
            Url(row.url, row.method, row.params, row.referer)
            for row in db.iter(sql, self.task_id)
        ]

        for rule_id, filename, risk in rules:
            run_url = attr_from_script(filename, RUN_URL_DEFAULT_FUN)
            if not run_url:
                # Script does not expose the expected entry point; skip it.
                continue
            DEBUG("rule_id:%s filename:%s run_url start" %
                  (rule_id, filename))
            for req in reqs:
                self.pool.spawn(self.runUrl, rule_id, run_url, req,
                                filename, risk)
                # Yield so other greenlets get a chance to run.
                gevent.sleep(0)
            # Logged once the jobs are spawned; nothing here waits for them.
            DEBUG("rule_id:%s filename:%s run_url end" %
                  (rule_id, filename))
        DEBUG("scheduleUrl end")
Пример #5
0
 def scheduleDomain(self):
     """
     Dispatch the domain-level rules (run_type 2).

     The rule script must define a run_domain function (looked up under
     the name held in RUN_DOMAIN_DEFAULT_FUN). Rules whose id already
     appears in ``self.finished_progress`` are left out; each remaining
     rule with a usable script gets one ``self.runDomain`` job on
     ``self.pool``.
     """
     DEBUG("scheduleDomain start")
     query = "SELECT `rule_id`,`risk`,`file_name` FROM `%s` WHERE `run_type` = 2 ORDER BY  `priority`" % RULE_TABLE

     # Read the complete rule list first; spawning only starts afterwards.
     pending = []
     for row in db.iter(query):
         rule_key = str(row.rule_id)
         if rule_key in self.finished_progress:
             continue
         pending.append((rule_key, row.file_name, row.risk))

     for rule_key, script, risk in pending:
         handler = attr_from_script(script, RUN_DOMAIN_DEFAULT_FUN)
         if not handler:
             continue
         DEBUG("rule_id:%s filename:%s run_domain start" % (rule_key, script))
         self.pool.spawn(self.runDomain, rule_key, handler, script, risk)
         # Yield so other greenlets get a chance to run.
         gevent.sleep(0)
         DEBUG("rule_id:%s filename:%s run_domain end" % (rule_key, script))
     DEBUG("scheduleDomain end")
Пример #6
0
 def scheduleDomain(self):
     """
     Queue one job per unfinished domain rule (run_type 2).

     The rule script must define a run_domain function, resolved through
     ``attr_from_script`` with RUN_DOMAIN_DEFAULT_FUN. Rule ids found in
     ``self.finished_progress`` are skipped, as are scripts for which the
     lookup returns a falsy value.
     """
     DEBUG("scheduleDomain start")
     statement = "SELECT `rule_id`,`risk`,`file_name` FROM `%s` WHERE `run_type` = 2 ORDER BY  `priority`" % RULE_TABLE

     # The rule rows are fully collected before any job is spawned.
     todo = []
     for record in db.iter(statement):
         rid = str(record.rule_id)
         if rid not in self.finished_progress:
             todo.append((rid, record.file_name, record.risk))

     for rid, fname, level in todo:
         entry = attr_from_script(fname, RUN_DOMAIN_DEFAULT_FUN)
         if entry:
             # Shared prefix of the two log lines for this rule.
             label = "rule_id:%s filename:%s" % (rid, fname)
             DEBUG(label + " run_domain start")
             self.pool.spawn(self.runDomain, rid, entry, fname, level)
             # Yield so other greenlets get a chance to run.
             gevent.sleep(0)
             DEBUG(label + " run_domain end")
     DEBUG("scheduleDomain end")