Example
class hiddenlink_obj():
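    '''Crawl a site with sinbot, run a hidden-link check on every collected URL,
    then generate an HTML report and store the result in the database.'''
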
    def __init__(self, url):
        spider_path = pf.getProfileValue('spider', 'path')
        spider_setting_path = pf.getProfileValue('spider_setting', 'path')
        sys.path.append(spider_path)         # make the sinbot module importable
        sys.path.append(spider_setting_path) # make the sinbot_settings module importable

        self.url = url                       # URL of the main page to be checked
        # self.rootPath = os.path.dirname(os.path.realpath(__file__)) # location of the current check
        self.resultHiddenlink = {}           # final detection results
        self.urlList = []                    # URLs passed in for detection
        self.curNum = 0                      # index of the URL currently being checked
        self.detectTM = ThreadManager()      # thread manager

    def init(self):

        def get_url(req_list):
            '''
            Description: extract the URLs from the request list returned by the crawler,
            normalize them, and remove duplicates.
            :param req_list: list of request objects produced by the crawler
            :return: set of normalized URLs
            '''
            tempList = []
            for item in req_list:
                url = item.url
                if url and url[-1] == '/':
                    url = url[:-1]          # drop the trailing slash so variants collapse to one entry
                tempList.append(url)
            return set(tempList)

        self.detectTM.setMaxThreads(10)     # maximum number of detection tasks allowed to run at once

        from sinbot import sinbot_start               # import the sinbot_start entry point
        from settings.settings import settings as st  # import the sinbot settings object
        st.set('DEPTH_LIMIT', settings.getint('DEPTH_LIMIT'))  # crawl depth; counting starts at 0, so 2 means three levels
        reqList = sinbot_start(self.url)    # start the crawl and collect the requests
        self.urlList = get_url(reqList)     # keep the normalized, de-duplicated URLs
        logger.info('Detect modules complete initialization...')

    def oneTask(self, url):
        self.curNum += 1        # bump the count of tasks started so far
        logger.info('One detect task is running(%d/%d), detect url is : %s' % (self.curNum, len(self.urlList), url))
        starttime = time.time()
        hdDetect = Detect(url)
        hdDetect.init_detect()
        hdDetect.evil_detect()
        hdDetect.print_hiddenlink_result()
        if len(hdDetect.hiddenSet):
            self.resultHiddenlink[url] = hdDetect.hiddenSet
        endtime = time.time()
        logger.info('One detect task finished! Using %f seconds!' % (endtime-starttime))

    def run(self):
        # 0. Record the detection start time
        startTime = time.time()
        temp = time.localtime(startTime)
        self.strStartTime = time.strftime('%Y-%m-%d %H:%M:%S', temp)

        for url in self.urlList:
            url = url.strip('\n')      # strip trailing newlines, which would make the browser request fail
            if url:                    # str.strip() never returns None, so check for an empty string instead
                args = (url, )
                self.detectTM.startTask(self.oneTask, args)
            else:
                logger.error('No url need to detect, please check it!')

        self.detectTM.join()
        # 2. Record the detection end time
        endTime = time.time()
        self.interval = human_time(endTime - startTime)         # human-readable elapsed time

        # 3. Generate the detection report
        logger.info('Detect running success! Now will make the detect report file!')
        html_report = HtmlFile(self)
        report_path = None
        try:
            report_path = html_report.genHtmlReport()
        except Exception as msg:
            logger.error('Make detect report file failed! Exception: %s.' % msg)
        else:
            logger.info('Store detect report success!')

        # 4. Write the detection results to the database
        threat_name = settings.get('THREAT_NAME')
        threat_sum = len(self.resultHiddenlink)
        threat_level = settings.get('THREAT_LEVEL')

        if report_path is None:
            logger.error('HTML maker get wrong report path! Please check it!')
            report_part_path = None
        else:
            path_list = report_path.split('/')
            report_part_path = path_list[-2] + '/' + path_list[-1]
        stat_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        if threat_sum != 0:
            record_id = PKgenerator.getPrimaryKeyId()
            try:
                ref_id = get_id_from_monitor_sites_by_url(self.url)
            except DarkException as msg:
                logger.error(msg)
            else:
                try:
                    store_url_hidden_report_in_monitor_statistic(record_id, ref_id, threat_name, threat_level, threat_sum, stat_time, report_part_path)
                except DarkException as msg:
                    logger.error(msg)
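
A minimal driver sketch for the class above, assuming it is importable as hiddenlink_obj; the module name hiddenlink, the target URL, and the availability of the project-specific dependencies (pf, ThreadManager, Detect, HtmlFile, settings, logger and the database helpers) are assumptions, not part of the original example:

# Hypothetical driver: crawl one site, check every collected URL for hidden
# links, then read back the aggregated results. Module name and URL are assumed.
from hiddenlink import hiddenlink_obj

if __name__ == '__main__':
    detector = hiddenlink_obj('http://www.example.com')
    detector.init()     # crawl with sinbot and fill detector.urlList
    detector.run()      # run the detection tasks, build the HTML report, store the result
    for page, hidden_links in detector.resultHiddenlink.items():
        print('%s -> %d hidden link(s)' % (page, len(hidden_links)))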