Пример #1
0
# Marker placed between paragraphs of the extracted article text.
_PARA_MARK = "[()]"

# A line is treated as article text when it contains one of these tags.
_PARAGRAPH_TAGS = ("<p>", "<P>", "<p ", "<P ", "</p>", "</P>", "<br><br>", "<BR><BR>")

# Lines containing one of these substrings are treated as page boilerplate.
# "opyright" matches both "Copyright" and "copyright"; the Chinese markers
# read "contact us" and "all rights reserved". The baseline pages are only
# checked against the first two, the processed pages against all three.
_OLD_PAGE_BOILERPLATE = ("opyright", u"联系我们")
_NEW_PAGE_BOILERPLATE = ("opyright", u"联系我们", u"版权所有")

# Captures the value of a src attribute that ends in ".jpg".
_JPG_SRC_PATTERN = r"(?i)src=['\"]{0,1}([^>\s\?&]*\.jpg)"
# Scheme + host of a link, used as the base for root-relative image paths.
_SITE_ROOT_PATTERN = r"(?i)(https?://[^/]*).*"
# Link up to and including its last "/", the base for relative image paths.
_PAGE_DIR_PATTERN = r"(?i)(https?://.*/)[^/]+"

# Pages yielding more than this many new images do not get them persisted.
_MAX_IMAGES_PER_PAGE = 6


def _paragraph_lines(html, boilerplate_markers):
    """Yield the stripped lines of *html* that look like article paragraphs."""
    for line in html.split("\n"):
        if "<meta" in line:
            continue
        if any(marker in line for marker in boilerplate_markers):
            continue
        if any(tag in line for tag in _PARAGRAPH_TAGS):
            yield line.strip()


def _jpg_sources(html):
    """Return every .jpg ``src`` value found in *html*, in document order."""
    flattened = html.replace("\n", "").replace("\r", "")
    return re.findall(_JPG_SRC_PATTERN, flattened)


def _article_text(lines):
    """Join paragraph lines into tag-free text separated by single markers."""
    text = "".join(line + _PARA_MARK for line in lines)
    text = re.sub(r"(?i)<style.*?</style>|<script.*?</script>", "", text)
    text = re.sub(r"(?i)</p>|<p[^>]*>|<br><br>", _PARA_MARK, text)
    text = re.sub(r"<[^>]*>", "", text)
    # Collapse every run of adjacent markers; two fixed replace() passes
    # left doubles behind when five or more markers were adjacent.
    return re.sub(r"(?:\[\(\)\]){2,}", _PARA_MARK, text)


def _image_region(html, title, lines):
    """Return the slice of *html* from the title to the cut-off paragraph.

    The cut-off is the paragraph roughly three quarters of the way through
    *lines*; the slice ends where that paragraph begins.
    """
    start = html.find(title) if title else 0
    if start < 0:
        start = 0
    # Floor division keeps the index an int on Python 2 and Python 3.
    cut_line = lines[len(lines) - len(lines) // 4 - 1]
    # Look for the raw line (without the paragraph marker, which never occurs
    # in the page) and only after the title so the slice cannot be inverted.
    end = html.find(cut_line, start)
    if end < 0:
        end = len(html)
    return html[start:end]


def _absolute_image_url(img, page_link):
    """Resolve *img* against *page_link*; return None if no base is found."""
    if img.find("http") != -1:
        return img
    if img[0] == "/":
        base = re.findall(_SITE_ROOT_PATTERN, page_link)
    else:
        base = re.findall(_PAGE_DIR_PATTERN, page_link)
    if not base:
        return None
    return base[0] + img


def skinSubTwo(oldHtmlList, newsHtmlList):
    """Extract article text and new images from freshly fetched pages.

    Paragraph lines and .jpg sources found in ``oldHtmlList`` act as a
    baseline: anything already seen there (or on an earlier page of
    ``newsHtmlList``) is treated as shared page template and ignored.

    For every item of ``newsHtmlList`` the function

    * stores the remaining paragraph text, tags stripped and paragraphs
      separated by ``"[()]"``, in ``realContent`` and sets ``status`` to
      ``"3"``;
    * builds a ``Picture`` for each unseen image between the title and the
      cut-off paragraph, caches that list in memcache under
      ``"piclist<id>"`` and queues it for saving when the page yielded at
      most six pictures.

    Finally ``newsHtmlList`` and the queued pictures are written with
    ``db.put``.

    Args:
        oldHtmlList: iterable of objects exposing ``content`` (page HTML).
        newsHtmlList: list of objects exposing ``content``, ``title``,
            ``link`` and ``key().id()``; they are modified in place.

    Returns:
        None.
    """
    seenLines = set()
    seenImages = set()
    picturesToSave = []

    # Pass 1: record what the baseline pages already contain.
    for content in oldHtmlList:
        html = content.content
        seenLines.update(_paragraph_lines(html, _OLD_PAGE_BOILERPLATE))
        for img in _jpg_sources(html):
            seenImages.add(img)
            # NOTE(review): third argument is presumably the expiry in
            # seconds -- confirm against the memcache client in use.
            memcache.set("img" + str(img), img, 360000)

    # Pass 2: keep only what is new on each page to process.
    for content in newsHtmlList:
        html = content.content

        freshLines = []
        for line in _paragraph_lines(html, _NEW_PAGE_BOILERPLATE):
            if line not in seenLines:
                seenLines.add(line)
                freshLines.append(line)

        # Without any extracted text the whole page is scanned for images.
        if freshLines:
            html = _image_region(html, content.title, freshLines)

        pagePictures = []
        for img in _jpg_sources(html):
            cacheKey = "img" + str(img)
            if memcache.get(cacheKey):
                continue
            memcache.set(cacheKey, img, 360000)
            if img in seenImages:
                continue
            seenImages.add(img)
            imgsrc = _absolute_image_url(img, content.link)
            if imgsrc is None:
                # The page link gives no usable base URL; skip the image
                # rather than fail the whole batch.
                continue
            pic = Picture()
            pic.content = content
            pic.src = imgsrc
            pagePictures.append(pic)
        memcache.set("piclist" + str(content.key().id()), {"list": pagePictures}, 3600 * 24)

        if len(pagePictures) <= _MAX_IMAGES_PER_PAGE:
            picturesToSave += pagePictures

        content.realContent = _article_text(freshLines)
        content.status = "3"

    db.put(newsHtmlList)
    db.put(picturesToSave)