def updateFeed(feedUrl, now, cutoff): print 'parsing ', feedUrl parser = feedparser.parse(feedUrl)#, agent='Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)') print 'status ', str(parser.status) # if parser.status == 500: # print news.escape_xml(parser.data) feedid = "feed/" + filters.encode_segment(feedUrl) for entry in parser.entries: link = entry.get('link', '') if not link: continue; artid = "art/" + filters.encode_segment(link) if feeds.redis.exists(artid): print 'skipping', link continue; print 'saving', link art = {} art['name'] = entry.get('title', '') art['guid'] = entry.get('guid', '') art['date'] = now if entry.has_key('published_parsed') and entry.published_parsed: art['date'] = calendar.timegm(entry.published_parsed) elif entry.has_key('date_parsed') and entry.date_parsed: art['date'] = calendar.timegm(entry.date_parsed) art['category'] = entry.get('category', '') feeds.redis.hmset(artid, art) feeds.redis.zadd(feedid, art['date'], artid) print 'purging ', feedUrl for artid in feeds.redis.zrangebyscore(feedid, "-inf", cutoff): feeds.redis.delete(artid) feeds.redis.zremrangebyscore(feedid, "-inf", cutoff)
def update_user(userid):
    """Refresh the unread-article zsets for every subscription of *userid*.

    For each subscription, copy article IDs from the feed's global index
    into the subscription's "/unread" zset (skipping anything already in
    "/read"), purge entries older than 60 days from both zsets, and record
    the time of this run in the user's hash.
    """
    logging.info('updating articles for user %s', userid)
    now = time.time()
    cutoff = now - (60 * 24 * 60 * 60)  # keep 60 days of articles
    # NOTE: the original fetched redis.hget(userid, 'update') here but never
    # used the value; the dead read has been removed.
    for subid in redis.smembers(userid + "/subs"):
        feedUrl = redis.hget(subid, "feedUrl")
        if not feedUrl:
            continue
        # update the feed's access time
        redis.zadd('feeds', now, feedUrl)
        # collect article IDs the user has not already read
        args = []
        feedid = "feed/" + filters.encode_segment(feedUrl)
        for artid, score in redis.zrange(feedid, 0, -1, None, True):
            if not redis.zscore(subid + "/read", artid):
                args.append(score)
                args.append(artid)
        # copy the new article IDs to the unread zset
        if args:
            redis.zadd(subid + "/unread", *args)
        # purge articles older than 60 days
        redis.zremrangebyscore(subid + "/unread", "-inf", cutoff)
        redis.zremrangebyscore(subid + "/read", "-inf", cutoff)
    # save now as the last update time
    redis.hset(userid, 'update', now)
def _create_user(self, environ, start_response, request, session):
    """Render the account-creation form and handle its submission.

    Validates the submitted username/passwords; on success creates the
    user hash (hsetnx guards against races on duplicate names), signs the
    session in, and redirects back to the original path.  Any validation
    failure falls through to re-render the form with an error message.
    """
    form_action = request.headers.get('X-Forwarded-Path', request.path)
    name = request.params.get("idem.username", "")
    first_password = request.params.get("idem.password1", "")
    second_password = request.params.get("idem.password2", "")
    message = ""
    if request.params["idem.create"] != "":
        # Initial click of the create button: just show the blank form.
        pass
    elif len(name) < 4:
        message = "Usernames must be at least 4 letters."
    elif len(first_password) < 6:
        message = "Passwords must be at least 6 letters."
    elif first_password != second_password:
        message = "Passwords do not match."
    else:
        new_userid = "user/" + filters.encode_segment(name)
        created = self.redis.hsetnx(new_userid, "password", ctx.encrypt(first_password))
        if created:
            # Account created: log the session in and bounce back.
            session[self.user_key] = new_userid
            redirect = request.get_response(webob.exc.HTTPFound(location=form_action))
            return redirect(environ, start_response)
        message = "Username already exists."
    # Re-render the creation form, echoing the submitted values escaped.
    page = webob.Response()
    page.charset = "UTF-8"
    markup = _form_create.substitute(
        action=cgi.escape(form_action, True),
        message=cgi.escape(message, True),
        username=cgi.escape(name, True),
        password1=cgi.escape(first_password, True),
        password2=cgi.escape(second_password, True)
    )
    page.text = unicode(markup)
    return page(environ, start_response)
def __call__(self, environ, start_response):
    """WSGI entry point: gate the wrapped application behind a login form.

    Logout requests clear the session and fall through to the login form;
    authenticated requests are passed to the wrapped application (or the
    change-password handler); everything else renders/handles the login
    form, dispatching to _create_user for account creation.
    """
    message = ""
    session = environ.get(self.session_key)
    userid = session.get(self.user_key)
    if environ["PATH_INFO"] == self.auth_logout:
        # Sign out: drop the session user and fall through to the form.
        session[self.user_key] = None
        message = "You have signed out."
    elif userid:
        # Already authenticated: hand off to the protected application.
        environ["REMOTE_USER"] = userid
        if environ["PATH_INFO"] == self.auth_change_password:
            return self._change_password(environ, start_response)
        else:
            return self.application(environ, start_response)
    request = webob.Request(environ)
    action = request.headers.get('X-Forwarded-Path', request.path)
    username = request.params.get("idem.username", "")
    password = request.params.get("idem.password", "")
    if environ["PATH_INFO"] == self.auth_logout:
        # Don't POST the login form back to the logout URL.
        action = "./"
    if request.method == "POST":
        # was request.params.has_key(...): deprecated Python 2 idiom
        if "idem.create" in request.params:
            return self._create_user(environ, start_response, request, session)
        else:
            userid = "user/" + filters.encode_segment(username)
            passwd = self.redis.hget(userid, "password")
            if passwd:
                if ctx.verify(password, passwd):
                    session[self.user_key] = userid
                    response = request.get_response(webob.exc.HTTPFound(location=action))
                    return response(environ, start_response)
            else:
                # Unknown user: verify against a dummy hash anyway so the
                # response time does not reveal whether the username exists.
                ctx.verify(password, self.dummy)
            message = "The username or password you entered is incorrect."
    response = webob.Response()
    response.charset = "UTF-8"
    response.text = unicode(_form_login.substitute(
        action=cgi.escape(action, True),
        message=cgi.escape(message, True),
        username=cgi.escape(username, True),
        password=cgi.escape(password, True)
    ))
    return response(environ, start_response)
def action(context):
    """Build the article-list view for the current user.

    Applies any posted read/unread action, collects unread (and optionally
    read) articles across the user's subscriptions, and fills *context*
    with "feeds", "articles", "qs", and the newer/older/oldest paging
    links.  May instead return a redirect response for next/skip actions.
    """
    feeds.update_user_maybe(context["user"])
    # LATER: if empty redirect to welcome
    # POST button names encode "verb:subid:artid"; pick out the first one.
    action = None
    subidLast = None
    artidLast = None
    if context["request"].method == "POST":
        for name in context["parameters"].keys():
            if (
                name.startswith("hide:") or
                name.startswith("next:") or
                name.startswith("show:") or
                name.startswith("skip:")
            ):
                parts = name.split(":")
                action = parts[0]
                subidLast = parts[1]
                artidLast = parts[2]
                break
    # "hide"/"next" move the article from unread to read (keeping its
    # score); "show" moves it back the other way.
    if action == "hide" or action == "next":
        scoreLast = feeds.redis.zscore(subidLast + "/unread", artidLast)
        if scoreLast:
            feeds.redis.zrem(subidLast + "/unread", artidLast)
            feeds.redis.zadd(subidLast + "/read", scoreLast, artidLast)
    elif action == "show":
        scoreLast = feeds.redis.zscore(subidLast + "/read", artidLast)
        if scoreLast:
            feeds.redis.zrem(subidLast + "/read", artidLast)
            feeds.redis.zadd(subidLast + "/unread", scoreLast, artidLast)
    feedFilter = context["parameters"].get("feed")  # limit view to one subscription
    showFilter = context["parameters"].get("show")  # "all" includes read articles
    ids = []      # tuples of (subid, feedName, artid, unread-flag, score)
    feeders = []  # per-subscription sidebar entries with unread counters
    now = time.time()
    cutoff = now - (60 * 24 * 60 * 60)  # 60 days ago
    # iterate subscription IDs sorted by each sub's feedName hash field
    for subid in feeds.redis.sort(context["user"] + "/subs", None, None, "*->feedName", None, False, True):
        feedName = feeds.redis.hget(subid, "feedName")
        count = 0
        if feedFilter and feedFilter != subid:
            # filtered out: only the unread count is needed for the sidebar
            count = feeds.redis.zcard(subid + "/unread")
        else:
            before = len(ids)
            ids.extend([
                (subid, feedName, artid, True, score)
                for artid, score in feeds.redis.zrangebyscore(subid + "/unread", "-inf", "+inf", None, None, True)
            ])
            count = len(ids) - before
        feeders.append({"subid": subid, "feedName": feedName, "counter": count})
        # opportunistically purge read entries older than the cutoff
        feeds.redis.zremrangebyscore(subid + "/read", "-inf", cutoff)
        if showFilter == "all" and (feedFilter == subid or not feedFilter):
            ids.extend([
                (subid, feedName, artid, False, score)
                for artid, score in feeds.redis.zrangebyscore(subid + "/read", "-inf", "+inf", None, None, True)
            ])
    # sort by date descending
    # NOTE(review): Python 2 list.sort(cmp, key) -- filters.compare is the
    # comparator, the lambda negates the score so newest sorts first.
    ids.sort(filters.compare, lambda x: -float(x[4]))
    # rebuild the query string carried through links and redirects
    qs = ""
    if feedFilter:
        qs = "?feed=" + urllib.quote_plus(feedFilter)
    if showFilter:
        qs += "&" if qs else "?"
        qs += "show=" + urllib.quote_plus(showFilter)
    if context["parameters"].get("prefetch"):
        qs += "&" if qs else "?"
        qs += "prefetch=1"
    # "next"/"skip": redirect to the newest article older than the one
    # just acted on (ids is sorted newest-first, so walk until the score
    # drops to or below the acted-on date)
    if action == "next" or action == "skip":
        scoreLast = float(context["parameters"].get("date", 0))
        subidNext = None
        artidNext = None
        for tup in ids:
            if tup[4] <= scoreLast:
                break
            subidNext = tup[0]
            artidNext = tup[2]
        if subidNext and artidNext:
            path = (
                context["root"] + "/feed/" + filters.encode_segment(subidNext) +
                "/read/" + filters.encode_segment(artidNext) + "/" + qs
            )
            return context["request"].get_response(webob.exc.HTTPFound(location=path))
    offset = 0
    try:
        offset = int(context["parameters"].get("offset", 0))
    except:
        # non-numeric offset parameter: fall back to the first page
        pass
    # paging links, 50 articles per page
    if offset > 0:
        context["newer"] = qs + ("&" if qs else "?") + "offset=" + str(offset - 50)
    if len(ids) - offset > 50:
        context["older"] = qs + ("&" if qs else "?") + "offset=" + str(offset + 50)
        oldest = len(ids) - 50
        if oldest > 0 and oldest - offset > 50:
            context["oldest"] = qs + ("&" if qs else "?") + "offset=" + str(oldest)
    # load the visible page of article hashes and decorate for the template
    articles = []
    for tup in ids[offset : offset + 50]:
        art = feeds.redis.hgetall(tup[2])
        art["subid"] = tup[0]
        art["feedName"] = tup[1]
        art["artid"] = tup[2]
        art["unread"] = tup[3]
        art["articleDate"] = str(datetime.datetime.utcfromtimestamp(float(art["date"])))
        feeds.makeUnicode(art)
        articles.append(art)
    context["feeds"] = feeders
    context["articles"] = articles
    context["qs"] = qs
def get_article_content(articleUrl, articleGuid, sub, lstLog=None):
    """Fetch an article page, extract the configured parts, and build the
    sanitized HTML as a unicode string (cached in redis for 20 minutes).

    articleUrl  -- the article's link URL
    articleGuid -- the article's guid; used instead of the link when the
                   subscription sets useGuid == '1'
    sub         -- subscription hash (may be None); 'useGuid' and 'xpath'
                   fields are consulted
    lstLog      -- optional list collecting diagnostic text; when given,
                   the redis cache is bypassed and timings are appended

    NOTE(review): the visible body only returns early on a cache hit; the
    trailing return of *result* appears to lie beyond this excerpt --
    confirm against the full file.
    """
    result = None
    # sys.stderr.write(str(articleUrl) + '\n')
    # sys.stderr.flush()
    url = articleUrl
    # optionally modify URL before fetching the article
    if sub and articleGuid and sub['useGuid'] == '1':
        url = articleGuid
    url = adjust_url(url, sub)
    # use cached copy if present
    key = url
    if sub and sub['xpath']:
        # the extraction xpath is part of the cache key, so changing the
        # xpath invalidates the cached rendering
        key = key + ' ' + sub['xpath']
    key = "page/" + filters.encode_segment(key)
    if not lstLog:
        result = redis.get(key)
        if result:
            return result.decode('utf-8')
    raw = None
    try:
        if lstLog:
            lstLog.append('fetching url: ')
            lstLog.append(url)
            lstLog.append('\n')
        # fetch the article
        before = time.clock()
        jar = cookielib.CookieJar()
        proc = urllib2.HTTPCookieProcessor(jar)
        redir = LoggingHTTPRedirectHandler(sub, lstLog)
        opener = urllib2.build_opener(proc, redir)
        # opener.addheaders.append(('Accept', '*/*'))
        # f = opener.open(url)
        req = make_request(url)
        f = opener.open(req)
        raw = f.read()
        # final URL after redirects; used below to absolutize relative links
        base = f.geturl()
        mime, params = cgi.parse_header(f.info().getheader('Content-Type'))
        encoding = params.get('charset')#, 'ISO-8859-1')
        f.close()
        if lstLog:
            lstLog.append(str(len(raw)))
            lstLog.append(' bytes retrieved in ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds, encoding ')
            lstLog.append(str(encoding))
            lstLog.append('\n')
        # tag soup parse the article
        before = time.clock()
        src = BeautifulSoup.BeautifulSoup(raw, "html5lib", from_encoding=encoding)
        if lstLog:
            lstLog.append('parse ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds\n')
        # sanitize the article markup - remove script, style, and more
        # also convert to xml.dom.minidom so we can use xpath
        before = time.clock()
        doc = soup2dom(src)
        if lstLog:
            lstLog.append('sanitize ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds\n')
        # extract the parts we want
        before = time.clock()
        parts = []
        if sub and sub['xpath']:
            # one xpath expression per line; matches are concatenated
            for path in sub['xpath'].split('\n'):
                parts.extend(xpath.find(path, doc))
        else:
            # no xpath configured: take the whole document
            parts.append(doc.documentElement)
        if lstLog:
            lstLog.append('xpath ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds\n')
            lstLog.append('xpath ')
            lstLog.append(str(len(parts)))
            lstLog.append(' parts\n')
        # remove class and id attributes so they won't conflict with ours
        # this makes the content smaller too
        # we do this after xpath so xpath can use class and id
        before = time.clock()
        for tag in doc.getElementsByTagName('*'):
            if tag.hasAttribute('class'):
                tag.removeAttribute('class')
            if tag.hasAttribute('id'):
                tag.removeAttribute('id')
            if tag.nodeName == 'a' and tag.hasAttribute('href'):
                # open article links in a new tab/window
                tag.setAttribute('target', '_blank')
        if lstLog:
            lstLog.append('clean ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds\n')
        # make relative URLs absolute so they work in our site
        before = time.clock()
        # NOTE(review): 'cache' is never used below -- looks like a leftover
        cache = {}
        for part in parts:
            for attr in [
                'action', 'background', 'cite', 'classid', 'codebase',
                'data', 'href', 'longdesc', 'profile', 'src', 'usemap'
            ]:
                for tag in xpath.find('.//*[@' + attr + ']', part):
                    value = tag.getAttribute(attr)
                    absolute = urlparse.urljoin(base, value)
                    tag.setAttribute(attr, absolute)
        if lstLog:
            lstLog.append('make urls absolute ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds\n')
        # convert to string
        before = time.clock()
        result = u''
        for part in parts:
            result += u'<div>'
            if part.nodeType == 2:
                # DOM ATTRIBUTE_NODE: xpath selected an attribute value
                result += part.nodeValue
            else:
                result += part.toxml('utf-8').decode('utf-8')
            result += u'</div>'
        if lstLog:
            lstLog.append('to string ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds\n')
        if lstLog:
            lstLog.append('article size: ')
            lstLog.append(filters.format_IEEE1541(len(result)))
            lstLog.append('\n')
        # cache the rendered article for 20 minutes
        redis.setex(key, 20*60, result)
        if lstLog and len(result) == 0:
            # debugging and nothing was extracted: show the log instead
            result += '<pre>\n'
            result += escape('\n'.join(lstLog))
            result += '\n</pre>'
    except Exception, err:
        logging.error("%s", pprint.pformat(err))
        text = str(err)
        if lstLog:
            lstLog.append('exception:\n')
            lstLog.append(text)
            lstLog.append('\nstack:\n')
            lstLog.append(traceback.format_exc())
            lstLog.append('source:\n')
            lstLog.append(repr(raw))
            if hasattr(err, 'read'):
                # urllib2.HTTPError carries the response body; include it
                lstLog.append('\nbody:\n')
                lstLog.append(err.read())
                lstLog.append('\n')
        # append an error report to whatever content was built so far
        if result:
            result += '\n'
        else:
            result = ''
        result += '<pre>\n'
        result += escape(str(url))
        result += '\n\n'
        result += escape(text)
        result += '\n</pre>\n<!--\n'
        result += escape(traceback.format_exc())
        result += '\n-->'
def update_user_maybe(userid):
    """Run update_user(userid) at most once every 10 minutes.

    A short-lived redis marker key ("update/<encoded userid>") acts as
    the throttle: while it exists, the expensive update is skipped.
    """
    key = "update/" + filters.encode_segment(userid)
    # BUG FIX: the original guard was "if True or not redis.exists(key)",
    # a debugging leftover that made the marker check dead code and forced
    # a full update on every call.
    if not redis.exists(key):
        update_user(userid)
        redis.setex(key, 600, "1")  # suppress further updates for 600 s