def CPUStage(cls, data):
        """Inspect the images of a fetched page and flag the page if too many are invalid.

        The image URLs are extracted from the page, each image is fetched and
        passed to ``cls.Parser.IsInvalidImg``.  As soon as the number of
        invalid images reaches ``len(images) * float(cls.unhealthrate)`` the
        page is reported through ``master_LogCmd.WriteTaskData``, a screenshot
        is taken with ``phantomjs snapshot.js <url> <picname>`` and the scan
        stops.

        Args:
            cls: Object providing ``Name()``, ``GetImgUrls()``, ``picFmt``,
                ``unhealthrate`` and ``Parser.IsInvalidImg()``.
            data: Indexable pair; ``data[0]`` is the page content and
                ``data[1]`` is the page URL.

        Returns:
            None.
        """
        # Function-scope import so the block does not depend on the file header.
        import subprocess

        pageContent = data[0]
        url = data[1]
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("%s get url : %s" % (cls.Name(), url))

        # Extract the URLs of the images contained in the page.
        imgSet = cls.GetImgUrls(url, pageContent, cls.picFmt)
        # Number of invalid images that marks the page as invalid, derived
        # from the configured invalid-image ratio.
        maxUnhealImgCnt = len(imgSet) * float(cls.unhealthrate)
        unHealthImgCnt = 0

        for imgurl in imgSet:
            # Fetch the image; a None payload means the fetch yielded nothing.
            imgtype, imgContent = Common.GetContentByUrl(imgurl)
            if imgContent is None:
                continue
            # Analyse whether this is an invalid image.
            if not cls.Parser.IsInvalidImg(imgContent, imgtype, imgurl):
                continue

            unHealthImgCnt += 1
            if unHealthImgCnt < maxUnhealImgCnt:
                continue

            # The invalid-image count reached the threshold: report the page.
            picName = str(random.randint(0, 10000000)) + '.jpg'
            master_LogCmd.WriteTaskData(
                "!!!! found invalid html by %s: url: %s, picname:%s" %
                (cls.Name(), url, picName))
            # Take a screenshot of the page.  The URL comes from crawled
            # content, so it is passed as a separate argument (no shell) to
            # prevent command injection.  stdout is piped and drained, as the
            # previous os.popen() call also kept it off our own stdout.
            try:
                subprocess.Popen(
                    ['phantomjs', 'snapshot.js', url, picName],
                    stdout=subprocess.PIPE).communicate()
            except OSError as err:
                # Snapshot is best-effort: a missing/unlaunchable phantomjs
                # must not abort the analysis task.
                master_LogCmd.WriteTaskData(
                    "Failed : snapshot of %s (%s)" % (url, err))
            break
Exemplo n.º 2
0
    def IOStage(cls, url):
        """Fetch ``url`` and queue HTML content for text-based analysis.

        While the fetch is running, an entry keyed by ``"<cls.mac>_<thread id>"``
        is stored through ``cls.FrameInfo`` (a redis-backed store according to
        the original comments) and removed afterwards.  The handled-URL
        counter stored under ``cls.handleCnt`` is incremented after the fetch.

        Args:
            cls: Object providing ``mac``, ``handleCnt`` and ``FrameInfo``
                (with ``hset``/``hget``/``hdel``).
            url: URL to fetch.

        Returns:
            None in every case; failures are reported through
            ``master_LogCmd.WriteTaskData``.
        """
        # Record which thread is currently fetching this URL.
        key = "%s_%d" % (cls.mac, threading.current_thread().ident)
        cls.FrameInfo.hset(key, url)
        try:
            # Fetch the data.
            contenttype, content = Common.GetContentByUrl(url)

            # Update the handled-page statistic; a counter that has not been
            # written yet (hget returning None) is treated as 0.
            # NOTE(review): read-modify-write is not atomic across threads;
            # switch to an atomic increment if FrameInfo exposes one.
            urlHandleCnt = int(cls.FrameInfo.hget(cls.handleCnt) or 0) + 1
            cls.FrameInfo.hset(cls.handleCnt, urlHandleCnt)
        finally:
            # Always clear the in-flight marker, even when the fetch or the
            # counter update raises, so no stale entry is left behind.
            cls.FrameInfo.hdel(key)

        # Fetch failure check: missing or empty content, or unknown type.
        if not content or contenttype is None:
            master_LogCmd.WriteTaskData("Failed : %s" % url)
            return None

        # Web pages trigger the "text-based invalid page analysis" task.
        if "text/html" in contenttype:
            slaver_WordAnalysisCmd.WriteTaskData(content, url)

        return None