# Substrings that mark a line as article body text: paragraph tags or a
# double line break, in lower- and upper-case spellings.
_PARAGRAPH_MARKERS = ("<p>", "<P>", "<p ", "<P ", "</p>", "</P>", "<br><br>", "<BR><BR>")

# Substrings that disqualify a line (footer / boilerplate text).  Pages in
# the "old" list are filtered with the shorter tuple, pages in the "new"
# list with the longer one.
_OLD_PAGE_BOILERPLATE = ("opyright", u"联系我们")
_NEW_PAGE_BOILERPLATE = ("opyright", u"联系我们", u"版权所有")

# Captures the target of every ``src=`` attribute that ends in ".jpg".
_JPG_SRC_PATTERN = r"(?i)src=['\"]{0,1}([^>\s\?&]*\.jpg)"

# Lifetimes, in seconds, handed to memcache.set().
_IMG_CACHE_SECONDS = 360000
_PICLIST_CACHE_SECONDS = 3600 * 24

# Separator appended after every kept line and substituted for paragraph tags.
_PARAGRAPH_SEPARATOR = "[()]"


def _is_body_line(line, boilerplate):
    """Return True if *line* has a paragraph marker, no boilerplate, no <meta."""
    if "<meta" in line:
        return False
    if any(word in line for word in boilerplate):
        return False
    return any(marker in line for marker in _PARAGRAPH_MARKERS)


def _find_jpg_sources(html):
    """Return every .jpg ``src`` value in *html*, ignoring line breaks."""
    return re.findall(_JPG_SRC_PATTERN, html.replace("\n", "").replace("\r", ""))


def _absolute_image_url(src, page_link):
    """Resolve image *src* against *page_link*; None if no base can be derived."""
    if "http" in src:
        # Anything mentioning "http" is treated as already absolute.
        return src
    if src[0] == "/":
        # Root-relative path: prepend scheme and host only.
        base = re.findall(r"(?i)(https?://[^/]*).*", page_link)
    else:
        # Document-relative path: prepend everything up to the last "/".
        base = re.findall(r"(?i)(https?://.*/)[^/]+", page_link)
    if not base:
        return None
    return base[0] + src


def skinSubTwo(oldHtmlList, newsHtmlList):
    """Extract the not-yet-seen text and .jpg images of each new page.

    Lines and .jpg sources found in ``oldHtmlList`` are recorded as already
    seen.  For every page in ``newsHtmlList`` the body lines that have not
    been seen before are joined, stripped of markup (paragraph boundaries
    become ``[()]``) and stored on the page as ``realContent``; its
    ``status`` is set to ``"3"``.  For pages with at least one kept line, a
    ``Picture`` is created for each .jpg that is neither in memcache nor
    already seen, and the page's pictures are cached under
    ``"piclist" + str(content.key().id())``.  Finally ``newsHtmlList`` and
    all created pictures are written with ``db.put``.

    Args:
        oldHtmlList: iterable of objects exposing ``content`` (HTML text).
        newsHtmlList: list of objects exposing ``content``, ``title``,
            ``link`` and ``key()``; each is mutated and then passed to
            ``db.put``.

    Returns:
        None.
    """
    seen_lines = set()
    seen_images = set()
    new_pictures = []

    # Pass 1: remember everything the old pages already contain.
    for content in oldHtmlList:
        html = content.content
        for line in html.split("\n"):
            if _is_body_line(line, _OLD_PAGE_BOILERPLATE):
                seen_lines.add(line.strip())
        for src in _find_jpg_sources(html):
            seen_images.add(src)
            memcache.set("img" + str(src), src, _IMG_CACHE_SECONDS)

    # Pass 2: keep only what is new on each new page.
    for content in newsHtmlList:
        html = content.content
        fresh_lines = []
        pictures = []

        for line in html.split("\n"):
            if not _is_body_line(line, _NEW_PAGE_BOILERPLATE):
                continue
            line = line.strip()
            if line not in seen_lines:
                seen_lines.add(line)
                fresh_lines.append(line + _PARAGRAPH_SEPARATOR)

        news = "".join(fresh_lines)
        # Drop inline style/script elements, turn paragraph boundaries into
        # the separator, then remove every remaining tag.
        news = re.sub(r"(?i)<style.*?</style>|<script.*?</script>", "", news)
        news = re.sub(r"(?i)</p>|<p[^>]*>|<br><br>", _PARAGRAPH_SEPARATOR, news)
        news = re.sub(r"<[^>]*>", "", news)
        # Two passes collapse runs of up to four separators into one.
        news = news.replace("[()][()]", "[()]")
        news = news.replace("[()][()]", "[()]")

        # NOTE(review): the nesting below was reconstructed from a source
        # whose indentation was lost; confirm that the picture handling and
        # the "piclist" cache write belong inside this guard.
        if fresh_lines:
            start = html.find(content.title)
            if start < 0:
                start = 0
            # Floor division keeps the index an int on every Python version.
            anchor = fresh_lines[len(fresh_lines) - len(fresh_lines) // 4 - 1]
            # NOTE(review): ``anchor`` still carries the appended "[()]"
            # separator, so this lookup returns -1 unless the page itself
            # contains that text, and the slice then ends one character
            # before the end of the page.  Behaviour kept as-is.
            end = html.find(anchor)
            body = html[start:end]

            for src in _find_jpg_sources(body):
                cache_key = "img" + str(src)
                if memcache.get(cache_key):
                    continue
                memcache.set(cache_key, src, _IMG_CACHE_SECONDS)
                if src in seen_images:
                    continue
                seen_images.add(src)
                url = _absolute_image_url(src, content.link)
                if url is None:
                    # No usable base URL in content.link; skip this image
                    # rather than abort the whole batch.
                    continue
                pic = Picture()
                pic.content = content
                pic.src = url
                pictures.append(pic)

            memcache.set(
                "piclist" + str(content.key().id()),
                {"list": pictures},
                _PICLIST_CACHE_SECONDS,
            )
            new_pictures += pictures

        content.realContent = news
        content.status = "3"

    db.put(newsHtmlList)
    db.put(new_pictures)