Example #1
def updateFeed(feedUrl, now, cutoff):
    print 'parsing ', feedUrl
    parser = feedparser.parse(feedUrl)#, agent='Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)')
    print 'status ', str(parser.status)
#        if parser.status == 500:
#            print news.escape_xml(parser.data)

    feedid = "feed/" + filters.encode_segment(feedUrl)
    for entry in parser.entries:
        link = entry.get('link', '')

        if not link:
            continue
        artid = "art/" + filters.encode_segment(link)
        if feeds.redis.exists(artid):
            print 'skipping', link
            continue

        print 'saving', link
        art = {}
        art['name'] = entry.get('title', '')
        art['guid'] = entry.get('guid', '')
        art['date'] = now
        if entry.has_key('published_parsed') and entry.published_parsed:
            art['date'] = calendar.timegm(entry.published_parsed)
        elif entry.has_key('date_parsed') and entry.date_parsed:
            art['date'] = calendar.timegm(entry.date_parsed)
        art['category'] = entry.get('category', '')
        feeds.redis.hmset(artid, art)
        feeds.redis.zadd(feedid, art['date'], artid)

    print 'purging ', feedUrl
    for artid in feeds.redis.zrangebyscore(feedid, "-inf", cutoff):
        feeds.redis.delete(artid)
    feeds.redis.zremrangebyscore(feedid, "-inf", cutoff)
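
For orientation, a minimal sketch of how updateFeed might be driven (assumed, not part of the original module): iterate over the 'feeds' access-time zset that update_user maintains in Example #2. The poll_feeds name is hypothetical; the 60-day cutoff mirrors the one used in Examples #2 and #5, and the module-level time/feeds imports are assumed.

def poll_feeds():
    # hypothetical driver loop, assuming the 'feeds' zset written by
    # update_user (Example #2) and the same redis client exposed as feeds.redis
    now = time.time()
    cutoff = now - (60 * 24 * 60 * 60)  # keep roughly 60 days of articles
    for feedUrl in feeds.redis.zrange('feeds', 0, -1):
        updateFeed(feedUrl, now, cutoff)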
Example #2
def update_user(userid):
    logging.info('updating articles for user %s', userid)
    now = time.time()
    cutoff = now - (60 * 24 * 60 * 60)
    update = redis.hget(userid, 'update')
    for subid in redis.smembers(userid + "/subs"):
        feedUrl = redis.hget(subid, "feedUrl")
        if feedUrl:
            # update the feed's access time
            redis.zadd('feeds', now, feedUrl)
            # get the new article IDs
            args = []
            feedid = "feed/" + filters.encode_segment(feedUrl)
            for artid, score in redis.zrange(feedid, 0, -1, None, True):
                if not redis.zscore(subid + "/read", artid):
                    args.append(score)
                    args.append(artid)
            # copy the new article IDs to the unread zset
            if args:
                redis.zadd(subid + "/unread", *args)
        # purge articles older than 60 days
        redis.zremrangebyscore(subid + "/unread", "-inf", cutoff)
        redis.zremrangebyscore(subid + "/read", "-inf", cutoff)
    # save now as the last update time
    redis.hset(userid, 'update', now)
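
The key layout update_user relies on can be read off the code above: a per-user hash with an 'update' field, a <userid>/subs set of subscription ids, a per-subscription hash holding feedUrl (and feedName, used in Example #5), per-subscription /read and /unread zsets scored by article date, and the per-feed feed/<encoded url> zsets that updateFeed populates. Below is a hedged sketch of creating a subscription under that layout; the subscribe name and the subid scheme are assumptions, not taken from the source.

def subscribe(userid, feedUrl, feedName):
    # hypothetical helper -- the real subid scheme is not shown in these examples
    subid = userid + "/sub/" + filters.encode_segment(feedUrl)
    redis.hmset(subid, {'feedUrl': feedUrl, 'feedName': feedName})
    redis.sadd(userid + "/subs", subid)
    return subid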
Example #3
def _create_user(self, environ, start_response, request, session):
    action = request.headers.get('X-Forwarded-Path', request.path)
    username = request.params.get("idem.username", "")
    password1 = request.params.get("idem.password1", "")
    password2 = request.params.get("idem.password2", "")
    message = ""
    if request.params["idem.create"] != "":
        pass # show blank form
    elif len(username) < 4:
        message = "Usernames must be at least 4 letters."
    elif len(password1) < 6:
        message = "Passwords must be at least 6 letters."
    elif password1 != password2:
        message = "Passwords do not match."
    else:
        userid = "user/" + filters.encode_segment(username)
        if self.redis.hsetnx(userid, "password", ctx.encrypt(password1)):
            session[self.user_key] = userid
            response = request.get_response(webob.exc.HTTPFound(location=action))
            return response(environ, start_response)
        message = "Username already exists."
    response = webob.Response()
    response.charset = "UTF-8"
    response.text = unicode(_form_create.substitute(
        action=cgi.escape(action, True),
        message=cgi.escape(message, True),
        username=cgi.escape(username, True),
        password1=cgi.escape(password1, True),
        password2=cgi.escape(password2, True)
    ))
    return response(environ, start_response)
Example #4
def __call__(self, environ, start_response):
    message = ""
    session = environ.get(self.session_key)
    userid = session.get(self.user_key)
    if environ["PATH_INFO"] == self.auth_logout:
        session[self.user_key] = None
        message = "You have signed out."
    elif userid:
        environ["REMOTE_USER"] = userid
        if environ["PATH_INFO"] == self.auth_change_password:
            return self._change_password(environ, start_response)
        else:
            return self.application(environ, start_response)
    request = webob.Request(environ)
    action = request.headers.get('X-Forwarded-Path', request.path)
    username = request.params.get("idem.username", "")
    password = request.params.get("idem.password", "")
    if environ["PATH_INFO"] == self.auth_logout:
        action = "./"
    if request.method == "POST":
        if request.params.has_key("idem.create"):
            return self._create_user(environ, start_response, request, session)
        else:
            userid = "user/" + filters.encode_segment(username)
            passwd = self.redis.hget(userid, "password")
            if passwd:
                if ctx.verify(password, passwd):
                    session[self.user_key] = userid
                    response = request.get_response(webob.exc.HTTPFound(location=action))
                    return response(environ, start_response)
            else:
                ctx.verify(password, self.dummy)
            message = "The username or password you entered is incorrect."
    response = webob.Response()
    response.charset = "UTF-8"
    response.text = unicode(_form_login.substitute(
        action=cgi.escape(action, True),
        message=cgi.escape(message, True),
        username=cgi.escape(username, True),
        password=cgi.escape(password, True)
    ))
    return response(environ, start_response)
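
The middleware's constructor is not shown in these examples, so the wiring below is a hypothetical sketch: the class name AuthMiddleware and its keyword arguments are assumptions inferred from the attributes used above (self.application, self.redis, self.session_key, self.user_key, self.auth_logout, self.auth_change_password, self.dummy). Any session middleware that stores a dict-like session in environ under session_key would satisfy what __call__ expects.

# hypothetical construction -- class name, argument names, and values are assumed
auth_app = AuthMiddleware(
    my_wsgi_app,                          # wrapped application
    redis=redis_client,                   # client holding the user/<name> hashes
    session_key='idem.session',           # where the session middleware puts its dict
    user_key='idem.userid',
    auth_logout='/logout',
    auth_change_password='/password',
)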
Example #5
def action(context):
    feeds.update_user_maybe(context["user"])

    # LATER: if empty redirect to welcome

    action = None
    subidLast = None
    artidLast = None
    if context["request"].method == "POST":
        for name in context["parameters"].keys():
            if (
                name.startswith("hide:")
                or name.startswith("next:")
                or name.startswith("show:")
                or name.startswith("skip:")
            ):
                parts = name.split(":")
                action = parts[0]
                subidLast = parts[1]
                artidLast = parts[2]
                break

    if action == "hide" or action == "next":
        scoreLast = feeds.redis.zscore(subidLast + "/unread", artidLast)
        if scoreLast:
            feeds.redis.zrem(subidLast + "/unread", artidLast)
            feeds.redis.zadd(subidLast + "/read", scoreLast, artidLast)
    elif action == "show":
        scoreLast = feeds.redis.zscore(subidLast + "/read", artidLast)
        if scoreLast:
            feeds.redis.zrem(subidLast + "/read", artidLast)
            feeds.redis.zadd(subidLast + "/unread", scoreLast, artidLast)

    feedFilter = context["parameters"].get("feed")
    showFilter = context["parameters"].get("show")

    ids = []
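    # each ids entry is a tuple: (subid, feedName, artid, unread_flag, score/date)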
    feeders = []
    now = time.time()
    cutoff = now - (60 * 24 * 60 * 60)
    for subid in feeds.redis.sort(context["user"] + "/subs", None, None, "*->feedName", None, False, True):
        feedName = feeds.redis.hget(subid, "feedName")
        count = 0
        if feedFilter and feedFilter != subid:
            count = feeds.redis.zcard(subid + "/unread")
        else:
            before = len(ids)
            ids.extend(
                [
                    (subid, feedName, artid, True, score)
                    for artid, score in feeds.redis.zrangebyscore(subid + "/unread", "-inf", "+inf", None, None, True)
                ]
            )
            count = len(ids) - before
        feeders.append({"subid": subid, "feedName": feedName, "counter": count})
        feeds.redis.zremrangebyscore(subid + "/read", "-inf", cutoff)
        if showFilter == "all" and (feedFilter == subid or not feedFilter):
            ids.extend(
                [
                    (subid, feedName, artid, False, score)
                    for artid, score in feeds.redis.zrangebyscore(subid + "/read", "-inf", "+inf", None, None, True)
                ]
            )

    # sort by date descending
    ids.sort(filters.compare, lambda x: -float(x[4]))

    qs = ""
    if feedFilter:
        qs = "?feed=" + urllib.quote_plus(feedFilter)
    if showFilter:
        qs += "&" if qs else "?"
        qs += "show=" + urllib.quote_plus(showFilter)
    if context["parameters"].get("prefetch"):
        qs += "&" if qs else "?"
        qs += "prefetch=1"

    if action == "next" or action == "skip":
        scoreLast = float(context["parameters"].get("date", 0))
        subidNext = None
        artidNext = None
        for tup in ids:
            if tup[4] <= scoreLast:
                break
            subidNext = tup[0]
            artidNext = tup[2]
        if subidNext and artidNext:
            path = (
                context["root"]
                + "/feed/"
                + filters.encode_segment(subidNext)
                + "/read/"
                + filters.encode_segment(artidNext)
                + "/"
                + qs
            )
            return context["request"].get_response(webob.exc.HTTPFound(location=path))

    offset = 0
    try:
        offset = int(context["parameters"].get("offset", 0))
    except (TypeError, ValueError):
        pass
    if offset > 0:
        context["newer"] = qs + ("&" if qs else "?") + "offset=" + str(offset - 50)
    if len(ids) - offset > 50:
        context["older"] = qs + ("&" if qs else "?") + "offset=" + str(offset + 50)
    oldest = len(ids) - 50
    if oldest > 0 and oldest - offset > 50:
        context["oldest"] = qs + ("&" if qs else "?") + "offset=" + str(oldest)

    articles = []
    for tup in ids[offset : offset + 50]:
        art = feeds.redis.hgetall(tup[2])
        art["subid"] = tup[0]
        art["feedName"] = tup[1]
        art["artid"] = tup[2]
        art["unread"] = tup[3]
        art["articleDate"] = str(datetime.datetime.utcfromtimestamp(float(art["date"])))
        feeds.makeUnicode(art)
        articles.append(art)

    context["feeds"] = feeders
    context["articles"] = articles
    context["qs"] = qs
Example #6
def get_article_content(articleUrl, articleGuid, sub, lstLog=None):
    result = None

#    sys.stderr.write(str(articleUrl) + '\n')
#    sys.stderr.flush()

    url = articleUrl

    # optionally modify URL before fetching the article
    if sub and articleGuid and sub['useGuid'] == '1':
        url = articleGuid
    url = adjust_url(url, sub)

    # use cached copy if present
    key = url
    if sub and sub['xpath']:
        key = key + ' ' + sub['xpath']
    key = "page/" + filters.encode_segment(key)
    if not lstLog:
        result = redis.get(key)
        if result:
            return result.decode('utf-8')

    raw = None
    try:
        if lstLog:
            lstLog.append('fetching url: ')
            lstLog.append(url)
            lstLog.append('\n')

        # fetch the article
        before = time.clock()
        jar = cookielib.CookieJar()
        proc = urllib2.HTTPCookieProcessor(jar)
        redir = LoggingHTTPRedirectHandler(sub, lstLog)
        opener = urllib2.build_opener(proc, redir)
#        opener.addheaders.append(('Accept', '*/*'))
#        f = opener.open(url)
        req = make_request(url)
        f = opener.open(req)
        raw = f.read()
        base = f.geturl()
        mime, params = cgi.parse_header(f.info().getheader('Content-Type'))
        encoding = params.get('charset')#, 'ISO-8859-1')
        f.close()

        if lstLog:
            lstLog.append(str(len(raw)))
            lstLog.append(' bytes retrieved in ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds, encoding ')
            lstLog.append(str(encoding))
            lstLog.append('\n')

        # tag soup parse the article
        before = time.clock()
        src = BeautifulSoup.BeautifulSoup(raw, "html5lib", from_encoding=encoding)

        if lstLog:
            lstLog.append('parse ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds\n')

        # sanitize the article markup - remove script, style, and more
        # also convert to xml.dom.minidom so we can use xpath
        before = time.clock()
        doc = soup2dom(src)

        if lstLog:
            lstLog.append('sanitize ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds\n')

        # extract the parts we want
        before = time.clock()
        parts = []
        if sub and sub['xpath']:
            for path in sub['xpath'].split('\n'):
                parts.extend(xpath.find(path, doc))
        else:
            parts.append(doc.documentElement)

        if lstLog:
            lstLog.append('xpath ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds\n')
            lstLog.append('xpath ')
            lstLog.append(str(len(parts)))
            lstLog.append(' parts\n')

        # remove class and id attributes so they won't conflict with ours
        # this makes the content smaller too
        # we do this after xpath so xpath can use class and id
        before = time.clock()
        for tag in doc.getElementsByTagName('*'):
            if tag.hasAttribute('class'):
                tag.removeAttribute('class')
            if tag.hasAttribute('id'):
                tag.removeAttribute('id')
            if tag.nodeName == 'a' and tag.hasAttribute('href'):
                tag.setAttribute('target', '_blank')

        if lstLog:
            lstLog.append('clean ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds\n')

        # make relative URLs absolute so they work in our site
        before = time.clock()
        cache = {}
        for part in parts:
            for attr in [ 'action', 'background', 'cite', 'classid', 'codebase', 'data', 'href', 'longdesc', 'profile', 'src', 'usemap' ]:
                for tag in xpath.find('.//*[@' + attr + ']', part):
                    value = tag.getAttribute(attr)
                    absolute = urlparse.urljoin(base, value)
                    tag.setAttribute(attr, absolute)

        if lstLog:
            lstLog.append('make urls absolute ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds\n')

        # convert to string
        before = time.clock()
        result = u''
        for part in parts:
            result += u'<div>'
            if part.nodeType == 2:
                result += part.nodeValue
            else:
                result += part.toxml('utf-8').decode('utf-8')
            result += u'</div>'

        if lstLog:
            lstLog.append('to string ')
            lstLog.append(str(time.clock() - before))
            lstLog.append(' seconds\n')

        if lstLog:
            lstLog.append('article size: ')
            lstLog.append(filters.format_IEEE1541(len(result)))
            lstLog.append('\n')

        redis.setex(key, 20*60, result)

        if lstLog and len(result) == 0:
            result += '<pre>\n'
            result += escape('\n'.join(lstLog))
            result += '\n</pre>'

    except Exception, err:
        logging.error("%s", pprint.pformat(err))
        text = str(err)
        if lstLog:
            lstLog.append('exception:\n')
            lstLog.append(text)
            lstLog.append('\nstack:\n')
            lstLog.append(traceback.format_exc())
            lstLog.append('source:\n')
            lstLog.append(repr(raw))
            if hasattr(err, 'read'):
                lstLog.append('\nbody:\n')
                lstLog.append(err.read())
            lstLog.append('\n')
        if result:
            result += '\n'
        else:
            result = ''
        result += '<pre>\n'
        result += escape(str(url))
        result += '\n\n'
        result += escape(text)
        result += '\n</pre>\n<!--\n'
        result += escape(traceback.format_exc())
        result += '\n-->'

    return result
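
A small usage sketch (assumed, not from the source): passing a list as lstLog skips the 20-minute Redis cache and collects the timing log; the URL is illustrative, and passing None for sub assumes adjust_url tolerates a missing subscription record.

log = []
body = get_article_content('http://example.com/story.html', None, None, lstLog=log)
print ''.join(log)   # fetch/parse/sanitize timings gathered along the way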
Example #7
def update_user_maybe(userid):
    key = "update/" + filters.encode_segment(userid)
    if True or not redis.exists(key):
        update_user(userid)
        redis.setex(key, 600, "1")