Пример #1
0
class Application(object):
    '''Post-process articles collected by the spider and persist the results.

    One run loads the saved factors, articles and cover images, gives each
    article a cover, rewrites the <img> tags inside its content, and writes
    the processed articles, image records and updated factors back through
    RedisService and MysqlService.

    Attributes:
        factors: dict of state carried between runs; this class reads and
            writes its 'time' entry.
        images: cover image records returned by MysqlService.getImages.
        contentImages: records describing the images found inside article
            content during this run.
        now: datetime captured when the instance is created.
        log: Log instance used for every info/error message.
    '''

    def __init__(self):
        self.factors = {}
        self.images = []
        self.contentImages = []
        self.now = datetime.datetime.now()
        self.log = Log(self.now)

    def run(self):
        '''Load, process and persist one batch of articles.'''
        nowStr = common.datetime_toStringYMDHMS(self.now)
        self.log.printInfo("program start,now:%s" %(nowStr))
        articles = self.loadData()
        # Truthiness rejects both None and an empty batch. An "or" between a
        # None check and len() would call len(None) and would also let empty
        # batches run the whole output stage for nothing.
        if articles:
            processedArticles = self.handleArticles(articles)
            self.outputData(processedArticles)
            self.log.printInfo("processedArticles:%d, contentImages:%d, images:%d, factorsTime:%s"
            %(len(processedArticles), len(self.contentImages), len(self.images), self.factors['time']))
        self.log.printInfo("program end,now:%s" %(nowStr))

    def loadData(self):
        '''Load factors, articles and cover images.

        Factors come from RedisService, articles and images from
        MysqlService. As many images are requested as there are qualified
        articles.

        returns:
            qualifiedArticles: the articles this run is going to handle
        '''
        self.log.printInfo("loadData start")
        # Fall back to an empty dict so a missing factors record does not
        # abort the run; checkTime then supplies the default time.
        self.factors = self.loadFactors() or {}
        articles = self.loadArticles(self.factors.get('time'))
        qualifiedArticles, num = self.articlesFilter(articles, self.factors)
        self.images = self.loadImages(num)
        self.log.printInfo("loadData end,qualifiedArticles:%d" %(len(qualifiedArticles)))
        return qualifiedArticles

    def loadArticles(self, time):
        '''Fetch articles through MysqlService.getArticles for the checked time.'''
        strTime = self.checkTime(time)
        service = MysqlService(self.log)
        return service.getArticles(strTime)

    def checkTime(self, time):
        '''Return a usable time string.

        args:
            time: time value taken from factors

        returns:
            time itself when it is a str, otherwise config.defaultFactorsTime
        '''
        if isinstance(time, str):
            return time
        return config.defaultFactorsTime

    def loadFactors(self):
        '''Fetch the factors through RedisService.getFactors.'''
        service = RedisService(self.log)
        return service.getFactors()

    def articlesFilter(self, articles, factors):
        '''Select the articles that qualify for processing.

        No filtering rule is implemented yet: every article qualifies.

        args:
            articles: articles loaded for this run, or None
            factors: factors dict (currently unused)

        returns:
            qualifiedArticles: the articles to handle ([] when articles is None)
            num: the number of qualified articles
        '''
        if articles is None:
            # A failed load must not crash on len(None); report "nothing to do".
            return [], 0
        return articles, len(articles)

    def loadImages(self, num):
        '''Fetch num cover images through MysqlService.getImages.'''
        service = MysqlService(self.log)
        return service.getImages(num)

    def handleArticles(self, articles):
        '''Give every article a cover, then process its content.

        returns:
            processedArticles: list of the handled article dicts
        '''
        self.log.printInfo("handleArticles start")
        processedArticles = []
        # Shallow copy: mergeCover consumes this list, while the dicts it
        # marks are still the ones held in self.images.
        imgs = list(self.images)
        for article in articles:
            article = self.mergeCover(article, imgs)
            processedArticles.append(self.handleArticle(article))
        self.log.printInfo("handleArticles end")
        return processedArticles

    def mergeCover(self, article, imgs):
        '''Assign the next available cover to article.

        Takes the first entry of imgs, marks it with config.defaultIfused and
        removes it from imgs; falls back to config.defaultCoverImages when
        imgs is empty.

        returns:
            article: the same dict with its "image" key set
        '''
        if len(imgs) > 0:
            cover = imgs[0]
            article["image"] = cover["src"]
            cover["ifused"] = config.defaultIfused
            del imgs[0]
        else:
            article["image"] = config.defaultCoverImages
        return article

    def handleArticle(self, article):
        '''Rewrite the content of article and fill in its derived fields.

        Sets 'content', 'imageNum', 'sumImageNum', 'writer' and 'brief'.

        returns:
            article: the same dict, updated in place
        '''
        content, imageNum, sumImageNum = self.handleImageInContent(article['content'], article['url'])
        article['content'] = content
        article['imageNum'] = imageNum
        article['sumImageNum'] = sumImageNum
        article['writer'] = self.getWriter(config.writer)
        article['brief'] = self.getBrief(article['content'])
        self.log.printInfo("Article %s" %(str(article)))
        return article

    def getWriter(self, writer):
        '''Return writer unchanged.'''
        return writer

    def getBrief(self, content):
        '''Extract a brief from content.

        returns:
            the first group matched by config.regexBrief, cut to
            config.briefLimit characters, or "" when nothing matched
        '''
        groups = self.regexGroup(content, config.regexBrief)
        # groups[0] is None when the first group did not take part in the match.
        if groups and groups[0]:
            return groups[0][0:config.briefLimit]
        return ""

    def handleImageInContent(self, content, source):
        '''Clean article content and rewrite its images.

        Applies, in order: the config.regexA substitution, the image
        rewrite, the config.regexAds substitution and the
        config.regexSpCharSingleQuotes substitution.

        args:
            content: article content
            source: article url

        returns:
            result: handled content
            numImg: the number of images that were resolved
            sumImg: the number of images found, resolved or not
        '''
        result, _ = self.regexSubn(content, config.regexA, config.replacementA)
        result, numImg, sumImg = self.regexMethodImage(result, config.regexImages, source)
        result, _ = self.regexSubn(result, config.regexAds, config.replacementAds)
        result, _ = self.regexSubn(result, config.regexSpCharSingleQuotes, config.replacementSpCharSingleQuotes)
        return result, numImg, sumImg

    def handleImages(self, images):
        '''Placeholder: always returns an empty list.'''
        return []

    def regexSubn(self, content, regex, alternative):
        '''Replace every match of regex in content with alternative.

        alternative is a regex replacement template, so backreferences and
        backslash escapes in it are interpreted.

        returns:
            result: the new string
            number: how many replacements were made
        '''
        return re.compile(regex).subn(alternative, content)

    def regexGroup(self, content, regex):
        '''Search content for regex, ignoring case, in multiline mode.

        returns:
            the groups of the first match, or () when there is no match
        '''
        match = re.search(regex, content, re.M|re.I)
        if match is None:
            return ()
        return match.groups()

    def regexMethodImage(self, content, regex, source):
        '''Collect the <img ... /> tags of content and rewrite them.

        args:
            content: article content
            regex: regex for <img ... />
            source: article url

        returns:
            result: handled content
            regexImagesCount: the number of images that were resolved
            len(images): the number of images found, resolved or not
        '''
        images = re.compile(regex).findall(content)
        subnImages, regexImagesCount = self.generateConImgsAndSubnImgs(images, source)
        result = self._replaceImageTags(content, subnImages)
        return result, regexImagesCount, len(images)

    def _replaceImageTags(self, content, subnImages):
        '''Swap each <img ... /> tag that mentions a name for its new tag.

        args:
            content: article content
            subnImages: dict, key: image name; value: replacement <img ... />

        returns:
            content with the matching tags replaced
        '''
        result = content
        for name, newTag in subnImages.items():
            # The name is escaped so "." and friends match literally, and
            # [^>] keeps the match inside one tag instead of running from the
            # first <img on the line up to the tag that holds the name.
            pattern = re.compile(r'<img[^>]+?%s[^>]+?/>' % re.escape(name))
            # A callable replacement inserts the tag verbatim; a plain string
            # would have its backslashes read as template escapes.
            result = pattern.sub(lambda match, tag=newTag: tag, result)
        return result

    def generateConImgsAndSubnImgs(self, images, source):
        '''Build replacement tags and content-image records for images.

        Every resolved image is also appended to self.contentImages.

        args:
            images: list of <img ... />
            source: article url

        returns:
            subnImages: dict, key: image name; value: rewritten <img ... />
            regexImagesCount: the number of images that were resolved
        '''
        subnImages = {}
        regexImagesCount = 0
        # Same flags as regexGroup, so whatever the search matched is also
        # what gets substituted.
        urlPattern = re.compile(config.regexImagesUrl, re.M|re.I)
        for image in images:
            image, _ = self.regexSubn(image, config.regexNoneGIf, config.replacementNoneGIf)
            groups = self.regexGroup(image, config.regexImagesUrl)
            if len(groups) <= 2:
                # Not resolvable: the url regex did not yield an image name.
                continue
            name = groups[2]
            newUrl = config.defaultImagesSrc + name
            # Insert the url verbatim rather than as a replacement template.
            subnImages[name] = urlPattern.sub(lambda match, url=newUrl: url, image)
            self.contentImages.append(self.generateContentImage(groups, source))
            regexImagesCount += 1
        return subnImages, regexImagesCount

    def generateContentImage(self, groups, source):
        '''Build the record stored for one image found in article content.

        args:
            groups: groups matched by config.regexImagesUrl; index 0 is
                stored as the url, index 2 is appended to the new src
            source: article url
        '''
        return {
            "url": groups[0],
            "type": config.contentImageType,
            "src": config.defaultImagesSrc + groups[2],
            "sourceArticles": source,
            "ifused": config.defaultIfused,
            "ifdown": config.defaultIfdown,
            "createtime": self.now,
        }

    def outputData(self, articles):
        '''Persist everything produced by this run.

        Updates factors['time'], then writes image records, articles (Redis
        and MySQL) and finally the factors. Each writer logs its own
        failures, so one failing store does not stop the others.
        '''
        self.log.printInfo("outputData start")
        redisArticles = self.generateRedisArticles(articles)
        self.factors['time'] = self.getLastestDate(articles)
        self.markUsingImages()
        self.outputByRedis(redisArticles)
        self.outputByMysql(articles)
        self.outputFactors()
        self.log.printInfo("outputData end")

    def getLastestDate(self, articles):
        '''Return the newest date among factors['time'] and the articles.

        args:
            articles: article dicts carrying an 'editdate'

        returns:
            the latest timestamp converted back with common.timestamp_toDate
        '''
        # Go through checkTime, like loadArticles does, so a missing or
        # non-string time falls back to the configured default.
        baseTime = self.checkTime(self.factors.get("time"))
        lastest = common.string_toDatetime(baseTime)
        lastestDateTime = datetime.datetime(*lastest[0:6])
        lastestDate = common.date_toTimestamp(lastestDateTime)
        for article in articles:
            date = common.date_toTimestamp(article['editdate'])
            if date > lastestDate:
                lastestDate = date
        return common.timestamp_toDate(lastestDate)

    def markUsingImages(self):
        '''Persist the cover images and the content images of this run.'''
        self.markCover(self.images)
        self.markContentImages(self.contentImages)

    def markCover(self, images):
        '''Update the cover image records through MysqlService.updateImages.

        returns:
            True on success, False when the update raised (the error is logged)
        '''
        try:
            service = MysqlService(self.log)
            service.updateImages(images)
            return True
        except Exception as e:
            self.log.printError("markCover error: '%s'" %(e))
            return False

    def markContentImages(self, images):
        '''Save the content image records through MysqlService.setImages.

        returns:
            True on success, False when the save raised (the error is logged)
        '''
        try:
            service = MysqlService(self.log)
            service.setImages(images)
            return True
        except Exception as e:
            self.log.printError("markContentImages error: '%s'" %(e))
            return False

    def generateRedisArticles(self, articles):
        '''Build the Redis representation of every article.'''
        return [self.generateRedisArticle(article) for article in articles]

    def generateRedisArticle(self, article):
        '''Build the Redis representation of one article.

        Copies id, url, editdate and the image counters from article, stores
        the content length, and fills the remaining fields from config
        defaults.
        '''
        return {
            "id": article["id"],
            "url": article["url"],
            "editdate": article["editdate"],
            "level": config.defaultLevel,
            "len": len(article["content"]),
            "ctr": config.defaultCtr,
            "pn": config.defaultPn,
            "pv": config.defaultPv,
            "imageNum": article["imageNum"],
            "sumImage": article["sumImageNum"],
        }

    def outputByRedis(self, articles):
        '''Write articles through RedisService.hsetArticles.

        returns:
            the service's result, or False when it raised (the error is logged)
        '''
        try:
            service = RedisService(self.log)
            return service.hsetArticles(articles)
        except Exception as e:
            self.log.printError("outputByRedis error: '%s'" %(e))
            return False

    def outputByMysql(self, articles):
        '''Write articles through MysqlService.setArticles.

        returns:
            True on success, False when the write raised (the error is logged)
        '''
        try:
            service = MysqlService(self.log)
            service.setArticles(articles)
            return True
        except Exception as e:
            self.log.printError("outputByMysql error: '%s'" %(e))
            return False

    def outputFactors(self):
        '''Save self.factors through RedisService.setFactors.

        returns:
            True on success, False when the save raised (the error is logged)
        '''
        try:
            service = RedisService(self.log)
            service.setFactors(self.factors)
            return True
        except Exception as e:
            self.log.printError("outputFactors error: '%s'" %(e))
            return False