def get(self):
    """Run the stripping pass for the feed of one status-"2" news item.

    Fetches a single ``Contents`` entity whose status is "2". When its
    ``rss`` reference cannot be read (or is empty) the entity is set to
    status "1", saved, and the request ends. Otherwise up to 20 status-"2"
    entities of the same feed are passed to ``skinSubTwo`` together with up
    to 10 entities of that feed whose status is greater than "2". That
    second list is stored in memcache for three days under a per-feed key.
    """
    pending = Contents.all().filter("status =", "2").fetch(1)
    if not pending:
        return

    head = pending[0]
    try:
        feed = head.rss
    except Exception:
        # Reading the reference failed; treat it like a missing feed below.
        feed = None

    if feed is None:
        head.status = "1"
        # Persist the change: without put() the new status is discarded and
        # the entity stays in the status-"2" set this handler reads from.
        head.put()
        logging.error("delete one news ,has no rss")
        return

    # Design note (translated from the original comment):
    # Next the raw material is processed. This is the first version of the
    # stripping routine.
    # 1. Look for all content between <p></p>.
    #    (From observation, sites that publish news RSS are large sites that
    #    tend to optimise their HTML, which keeps news pages simple, so this
    #    approach should be workable.)
    fresh = Contents.all().filter("rss =", feed).filter("status =", "2").fetch(20)
    if not fresh:
        # Nothing left to process for this feed; the original indexed
        # element 0 of this list unconditionally and would raise IndexError.
        return

    cache_key = "oldhtmllist" + str(feed.key().id())
    previous = memcache.get(cache_key)
    if not previous:
        previous = Contents.all().filter("rss =", feed).filter("status >", "2").fetch(10)
        try:
            memcache.set(cache_key, previous, 3600 * 24 * 3)
        except Exception:
            # Caching is best-effort; record the failure instead of hiding it.
            logging.warning("could not cache %s", cache_key, exc_info=True)

    skinSubTwo(previous, fresh)
def get(self):
    """Render ``templates/look.html`` for one ``Contents`` entity.

    With a ``content`` request parameter the entity with that numeric id is
    rendered with ``view`` set to True. Otherwise the first entity whose
    ``realContentResult`` equals 0 is rendered, restricted to the feed named
    by the ``rss`` parameter when that yields a result. If no such entity
    exists the client is redirected to "/".
    """
    request = self.request

    content_id = request.get("content")
    if content_id:
        entity = Contents.get_by_id(int(content_id))
        self.render("templates/look.html", {"content": entity, "view": True})
        return

    feed_id = request.get("rss")
    candidates = None
    if feed_id:
        candidates = (
            Contents.all()
            .filter("realContentResult =", 0)
            .filter("rss =", RSS.get_by_id(int(feed_id)))
            .fetch(1)
        )
    if not candidates:
        # No feed given, or the feed has nothing with result 0: fall back to
        # any entity with result 0.
        candidates = Contents.all().filter("realContentResult =", 0).fetch(1)

    if not candidates:
        self.redirect("/")
        return
    self.render("templates/look.html", {"content": candidates[0]})
def get(self):
    """Render ``templates/detailLook.html`` for one feed.

    The feed is looked up from the numeric ``rss`` request parameter. The
    template receives, as ``content``, the (unfetched) query over
    ``Contents`` entities of that feed whose ``realContentResult`` is
    greater than 0 and not None.
    """
    feed = RSS.get_by_id(int(self.request.get("rss")))

    query = Contents.all()
    query = query.filter("rss =", feed)
    query = query.filter("realContentResult >", 0)
    query = query.filter("realContentResult !=", None)

    self.render("templates/detailLook.html", {"content": query})
def get(self):
    """Delete two small batches of status-"1" ``Contents`` entities.

    First batch: up to 10 entities with ``hasContent`` False. Second batch:
    up to 10 entities with ``hasDelete`` True, fetched with an offset of 30.
    The size of both batches is written to the info log.
    """
    status_one = ("status =", "1")

    without_content = Contents.all().filter(*status_one).filter("hasContent =", False).fetch(10)
    db.delete(without_content)

    # fetch(limit, offset): the second positional argument is the offset.
    flagged_deleted = Contents.all().filter(*status_one).filter("hasDelete =", True).fetch(10, 30)
    db.delete(flagged_deleted)

    summary = "".join(
        ["nocontent:", str(len(without_content)), "-", "deletecontent:", str(len(flagged_deleted))]
    )
    logging.info(summary)
def post(self):
    """Store a review verdict on the ``Contents`` entity with the posted link.

    Reads ``link``, ``realContentResult`` (converted to int),
    ``realContentBz`` and ``rss`` from the request. When an entity with that
    link exists it is updated, saved, and the client is redirected to
    ``/look`` for the given feed; otherwise the client is redirected to "/".
    """
    request = self.request

    found = Contents.all().filter("link =", request.get("link")).fetch(1)
    if not found:
        self.redirect("/")
        return

    entry = found[0]
    entry.realContentResult = int(request.get("realContentResult"))
    entry.realContentBz = request.get("realContentBz")
    entry.put()
    self.redirect("/look?rss=%s" % request.get("rss"))
def get(self):
    """Render ``templates/analysis.html`` with per-feed result counts.

    For every ``RSS`` entity, attributes ``r0`` .. ``r5`` receive the number
    of its ``Contents`` entities whose ``realContentResult`` equals 0 .. 5,
    and ``r6`` the number whose ``realContentResult`` is None. The annotated
    feeds are passed to the template as ``RSSs``.
    """
    annotated = []
    for feed in RSS.all():
        # r0..r5: one count query per numeric result value, in ascending order.
        for result in range(6):
            tally = Contents.all().filter("rss =", feed).filter("realContentResult =", result).count()
            setattr(feed, "r%d" % result, tally)
        # r6: entities with no result recorded.
        feed.r6 = Contents.all().filter("rss =", feed).filter("realContentResult =", None).count()
        annotated.append(feed)

    self.render("templates/analysis.html", {"RSSs": annotated})
def get(self, limit):
    """Collect status-"1" ``Contents`` entities and start the RSS search.

    Args:
        limit: maximum number of entities to fetch; converted with ``int``.

    ``self.urls`` is reset and filled with ``(entity, entity.link)`` pairs
    before ``self.searchRSS()`` is called.
    """
    self.urls = []
    batch = Contents.all().filter("status =", "1").fetch(int(limit))
    self.urls.extend((entity, entity.link) for entity in batch)
    self.searchRSS()