Example No. 1
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
    '''Parse an XSPF feed from a URL, file, stream, or string'''
    result = FeedParserDict()
    result['playlist'] = FeedParserDict()
    if _XML_AVAILABLE:
        result['bozo'] = 0
    if type(handlers) == types.InstanceType:
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers)
        data = f.read()
    except Exception as e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = ''
        f = None
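
The snippet above is cut off before the function returns; in feedparser-style APIs the caller receives the result dict and checks its error flags rather than catching exceptions. A minimal usage sketch, assuming the function ends by returning result (the URL is a placeholder):

result = parse('http://example.com/playlist.xspf')
if result.get('bozo'):
    # the parser swallowed an exception and preserved it here
    print('parse error: %r' % result.get('bozo_exception'))
else:
    playlist = result['playlist']
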
Example No. 2
def run(num=None):
    feeds, feedfileObject = load()
    mailserver = None
    try:
        # We store the default to address as the first item in the feeds list.
        # Here we take it out and save it for later.
        default_to = ""
        if feeds and isstr(feeds[0]):
            default_to = feeds[0]
            ifeeds = feeds[1:]
        else:
            ifeeds = feeds
        
        if num: ifeeds = [feeds[num]]
        feednum = 0
        
        for f in ifeeds:
            try: 
                feednum += 1
                if not f.active: continue
                
                if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
                r = {}
                try:
                    r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
                except TimeoutError:
                    print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
                    continue
                
                # Handle various status conditions, as required
                if 'status' in r:
                    if r.status == 301: f.url = r['url']
                    elif r.status == 410:
                        print >>warn, "W: feed gone; deleting", f.url
                        feeds.remove(f)
                        continue
                
                http_status = r.get('status', 200)
                if VERBOSE > 1: print >>warn, "I: http status", http_status
                http_headers = r.get('headers', {
                  'content-type': 'application/rss+xml', 
                  'content-length':'1'})
                exc_type = r.get("bozo_exception", Exception()).__class__
                if http_status != 304 and not r.entries and not r.get('version', ''):
                    if http_status not in [200, 302]: 
                        print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)

                    elif contains(http_headers.get('content-type', 'rss'), 'html'):
                        print >>warn, "W: looks like HTML [%d] %s"  % (feednum, f.url)

                    elif http_headers.get('content-length', '1') == '0':
                        print >>warn, "W: empty page [%d] %s" % (feednum, f.url)

                    elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
                        print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
                    
                    elif exc_type == IOError:
                        print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
                    
                    elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
                        print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
                    
                    elif exc_type in socket_errors:
                        exc_reason = r.bozo_exception.args[1]
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)

                    elif exc_type == urllib2.URLError:
                        if r.bozo_exception.reason.__class__ in socket_errors:
                            exc_reason = r.bozo_exception.reason.args[1]
                        else:
                            exc_reason = r.bozo_exception.reason
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    
                    elif exc_type == AttributeError:
                        print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
                    
                    elif exc_type == KeyboardInterrupt:
                        raise r.bozo_exception
                        
                    elif r.bozo:
                        print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))

                    else:
                        print >>warn, "=== rss2email encountered a problem with this feed ==="
                        print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                        print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                        print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
                        print >>warn, r
                        print >>warn, "rss2email", __version__
                        print >>warn, "feedparser", feedparser.__version__
                        print >>warn, "html2text", h2t.__version__
                        print >>warn, "Python", sys.version
                        print >>warn, "=== END HERE ==="
                    continue
                
                r.entries.reverse()
                
                for entry in r.entries:
                    id = getID(entry)
                    
                    # If TRUST_GUID isn't set, we get back hashes of the content.
                    # Instead of letting these run wild, we put them in context
                    # by associating them with the actual ID (if it exists).
                    
                    frameid = entry.get('id')
                    if not(frameid): frameid = id
                    if type(frameid) is DictType:
                        frameid = frameid.values()[0]
                    
                    # If this item's ID is in our database
                    # then it's already been sent
                    # and we don't need to do anything more.
                    
                    if frameid in f.seen:
                        if f.seen[frameid] == id: continue

                    if not (f.to or default_to):
                        print "No default email address defined. Please run 'r2e email emailaddress'"
                        print "Ignoring feed %s" % f.url
                        break
                    
                    if 'title_detail' in entry and entry.title_detail:
                        title = entry.title_detail.value
                        if contains(entry.title_detail.type, 'html'):
                            title = html2text(title)
                    else:
                        title = getContent(entry)[:70]

                    title = title.replace("\n", " ").strip()
                    
                    when = time.gmtime()

                    if DATE_HEADER:
                        for datetype in DATE_HEADER_ORDER:
                            kind = datetype+"_parsed"
                            if kind in entry and entry[kind]: when = entry[kind]
                        
                    link = entry.get('link', "")
                    
                    from_addr = getEmail(r, entry)
                    
                    name = h2t.unescape(getName(r, entry))
                    fromhdr = formataddr((name, from_addr,))
                    tohdr = (f.to or default_to)
                    subjecthdr = title
                    datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", when)
                    useragenthdr = "rss2email"
                    
                    # Add post tags, if available
                    tagline = ""
                    if 'tags' in entry:
                        tags = entry.get('tags')
                        taglist = []
                        if tags:
                            for tag in tags:
                                taglist.append(tag['term'])
                        if taglist:
                            tagline = ",".join(taglist)
                    
                    extraheaders = {'Date': datehdr,
                                    'User-Agent': useragenthdr,
                                    'X-RSS-Feed': f.url,
                                    'Message-ID': '<%s>' % hashlib.sha1(id.encode('utf-8')).hexdigest(),
                                    'X-RSS-ID': id,
                                    'X-RSS-URL': link,
                                    'X-RSS-TAGS': tagline,
                                    'X-MUNGED-FROM': getMungedFrom(r),
                                    'References': ''}
                    if BONUS_HEADER != '':
                        for hdr in BONUS_HEADER.strip().splitlines():
                            pos = hdr.strip().find(':')
                            if pos > 0:
                                extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
                            else:
                                print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER 
                    
                    entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
                    contenttype = 'plain'
                    content = ''
                    if THREAD_ON_TAGS and len(tagline):
                        extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(t.strip().encode('utf-8')).hexdigest() for t in tagline.split(',')])
                    if USE_CSS_STYLING and HTML_MAIL:
                        contenttype = 'html'
                        content = "<html>\n" 
                        content += '<head><meta http-equiv="Content-Type" content="text/html"><style>' + STYLE_SHEET + '</style></head>\n'
                        content += '<body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">\n'
                        content += '<div id="entry">\n'
                        content += '<h1 class="header"'
                        content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
                        if ishtml(entrycontent):
                            body = entrycontent[1].strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (summarize(html2text(body, plaintext=True), SUMMARIZE) + "<hr>")
                        else:
                            body = entrycontent.strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (summarize(body, SUMMARIZE) + "<hr>")
                        if THREAD_ON_LINKS:
                            parser = Parser()
                            parser.feed(body)
                            extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(h.strip().encode('utf-8')).hexdigest() for h in parser.attrs])
                        if INLINE_IMAGES_DATA_URI:
                            parser = Parser(tag='img', attr='src')
                            parser.feed(body)
                            for src in parser.attrs:
                                try:
                                    img = feedparser._open_resource(src, None, None, feedparser.USER_AGENT, link, [], {})
                                    data = img.read()
                                    if hasattr(img, 'headers'):
                                        headers = dict((k.lower(), v) for k, v in dict(img.headers).items())
                                        ctype = headers.get('content-type', None)
                                        if ctype and INLINE_IMAGES_DATA_URI:
                                            body = body.replace(src,'data:%s;base64,%s' % (ctype, base64.b64encode(data)))
                                except Exception:
                                    print >>warn, "Could not load image: %s" % src
                        if body != '':  
                            content += '<div id="body">\n' + body + '</div>\n'
                        content += '\n<p class="footer">URL: <a href="'+link+'">'+link+'</a>'
                        if hasattr(entry,'enclosures'):
                            for enclosure in entry.enclosures:
                                if (hasattr(enclosure, 'url') and enclosure.url != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
                                if (hasattr(enclosure, 'src') and enclosure.src != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'">\n')
                        if 'links' in entry:
                            for extralink in entry.links:
                                if ('rel' in extralink) and extralink['rel'] == u'via':
                                    extraurl = extralink['href']
                                    extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
                                    viatitle = extraurl
                                    if ('title' in extralink):
                                        viatitle = extralink['title']
                                    content += '<br/>Via: <a href="'+extraurl+'">'+viatitle+'</a>\n'
                        content += '</p></div>\n'
                        content += "\n\n</body></html>"
                    else:   
                        if ishtml(entrycontent):
                            contenttype = 'html'
                            content = "<html>\n" 
                            content = ("<html><body>\n\n" + 
                                       '<h1><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n' +
                                       entrycontent[1].strip() + # drop type tag (HACK: bad abstraction)
                                       '<p>URL: <a href="'+link+'">'+link+'</a></p>' )
                                       
                            if hasattr(entry,'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'
                                                                
                            content += ("\n</body></html>")
                        else:
                            content = entrycontent.strip() + "\n\nURL: "+link
                            if hasattr(entry,'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('\nEnclosure: ' + enclosure.url + "\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += '\nVia: '+extralink['title']+' ('+extralink['href']+')\n'

                    mailserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, when, extraheaders, mailserver, f.folder)
            
                    f.seen[frameid] = id
                    
                f.etag, f.modified = r.get('etag', None), r.get('modified', None)
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                print >>warn, "=== rss2email encountered a problem with this feed ==="
                print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                print >>warn, "E: could not parse", f.url
                traceback.print_exc(file=warn)
                print >>warn, "rss2email", __version__
                print >>warn, "feedparser", feedparser.__version__
                print >>warn, "html2text", h2t.__version__
                print >>warn, "Python", sys.version
                print >>warn, "=== END HERE ==="
                continue

    finally:        
        unlock(feeds, feedfileObject)
        if mailserver:
            if IMAP_MARK_AS_READ:
                for folder in IMAP_MARK_AS_READ:
                    mailserver.select(folder)
                    res, data = mailserver.search(None, '(UNSEEN UNFLAGGED)')
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('STORE', u, '+FLAGS', r'(\Seen)')
            if IMAP_MOVE_READ_TO:
                typ, data = mailserver.list(pattern='*')
                # Parse folder listing as a CSV dialect (automatically removes quotes)
                reader = csv.reader(StringIO.StringIO('\n'.join(data)),dialect='mailboxlist')
                # Iterate over each folder
                for row in reader:
                    folder = row[-1:][0]
                    if folder == IMAP_MOVE_READ_TO or r'\Noselect' in row[0]:
                        continue
                    mailserver.select(folder)
                    yesterday = (datetime.now() - timedelta(days=1)).strftime("%d-%b-%Y")
                    res, data = mailserver.search(None, '(SEEN BEFORE %s UNFLAGGED)' % yesterday)
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('COPY', u, IMAP_MOVE_READ_TO)
                                if res == 'OK':
                                    res, data = mailserver.uid('STORE', u, '+FLAGS', r'(\Deleted)')
                                    mailserver.expunge()
            try:
                mailserver.quit()
            except Exception:
                mailserver.logout()
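
The timelimit(FEED_TIMEOUT, parse) call above guards against a feed fetch that hangs forever. rss2email's helper runs the wrapped function in a worker thread; the following is a minimal sketch of that pattern (the TimeoutError class mirrors the one caught above, the rest is illustrative):

import threading

class TimeoutError(Exception):
    pass

def timelimit(timeout, function):
    def run(*args, **kwargs):
        result = {}
        def worker():
            try:
                result['value'] = function(*args, **kwargs)
            except Exception as e:
                result['error'] = e
        t = threading.Thread(target=worker)
        t.daemon = True  # a hung fetch should not keep the process alive
        t.start()
        t.join(timeout)
        if t.is_alive():
            raise TimeoutError()
        if 'error' in result:
            raise result['error']
        return result['value']
    return run

Note the worker thread cannot actually be killed; on timeout it is simply abandoned, which is why the wrapper marks it as a daemon.
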
Example No. 3
    def getLink(self):
        """Reads the HTML page and extracts the link, title and body."""

        if not self.children.intersection(self.attrs):
            return      # mandatory child element missing

        self.loadCache()
        try:
            f = feedparser._open_resource(self.uri, self.etag, self.modified,
                USER_AGENT, None, [], {}
            )
            html = f.read()
        except Exception as e:
            sys.stderr.write('Getting page %s: %s\n' % (self.uri, e))
            return

        if getattr(f, 'status', None) == 304 or not html:
            # not modified or empty page
            return

        # save HTTP headers
        if hasattr(f, 'info'):
            info = f.info()
            etag = info.getheader('ETag')
            modified = info.getheader('Last-Modified')
            if modified:
                modified = feedparser._parse_date(modified)
            self.saveCache(etag, modified)

            # if the page is compressed, decompress it
            ce = info.getheader('Content-Encoding', '')
            if ce == 'gzip':
                try:
                    import gzip
                    import StringIO
                    html = gzip.GzipFile(fileobj=StringIO.StringIO(html)).read()
                except Exception as e:
                    sys.stderr.write('Unzipping page %s: %s\n' % (self.uri, e))
                    return
            elif ce == 'deflate':
                try:
                    import zlib
                    html = zlib.decompress(html, -zlib.MAX_WBITS)
                except Exception as e:
                    sys.stderr.write('Inflating page %s: %s\n' % (self.uri, e))
                    return

        # resolve relative URIs
        html = feedparser._resolveRelativeURIs(html, self.uri, self.encoding, 'text/html')

        if hasattr(f, 'headers'):
            charsets = [c for c in feedparser._getCharacterEncoding(f.headers, html) if c]
        else:
            charsets = [self.encoding]
        for charset in charsets:
            try:
                html = html.decode(charset)
                break
            except (UnicodeDecodeError, LookupError):
                pass

        if 'regex' in self.attrs:
            self.match_regex(html)
        else:
            self.match_xpath(html)
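
The decode loop above tries each candidate charset in order and silently moves on when one fails or is unknown to Python. Factored out as a helper, the same technique looks like this (a sketch; the final latin-1 fallback is an added assumption, the original simply leaves the page undecoded):

def decode_with_fallback(raw, charsets):
    """Try each candidate charset; return the first successful decode."""
    for charset in charsets:
        try:
            return raw.decode(charset)
        except (UnicodeDecodeError, LookupError):
            continue
    # latin-1 maps every byte value, so this last resort cannot fail
    return raw.decode('latin-1')
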
Example No. 4
    fileext = translateType.get(type_, "")

    defaults = NSUserDefaults.standardUserDefaults()
    cache = False
    try:
        cache = bool(defaults.objectForKey_(u'optCache'))
    except StandardError as err:
        print "ERROR reading defaults.", repr(err)
    
    if cache:
        if not nsurl.isFileURL():
            nsurl = cache_url(nsurl, fileext)

    url = NSURL2str(nsurl)

    fob = feedparser._open_resource(url, None, None, CactusVersion.user_agent, None, [], {})
    s = fob.read()
    fob.close()

    if type_ == CactusOPMLType:
        # this is a quick & dirty approach and should be applied much more carefully
        # than it is now... perhaps those errors get corrected and <directivecache>
        # will be a proper node.

        # clean up bogative xml declaration. OPML-Editor, I'm looking at you...
        if s.startswith("""<?xml encoding="ISO-8859-1" version="1.0"?>"""):
            s = s.replace("""<?xml encoding="ISO-8859-1" version="1.0"?>""",
                          """<?xml version="1.0" encoding="ISO-8859-1"?>""")

            if kwlog:
                print "\nBOGUS XML DELARATION REPLACED\n"
Example No. 5
 def test_unicode_2(self):
     s = u'<feed><item><title>t\u00e9xt</title></item></feed>'
     r = feedparser._open_resource(s, '', '', '', '', [], {})
     self.assertEqual(s.encode('utf-8'), r.read())
Example No. 6
 def test_string(self):
     s = '<feed><item><title>text</title></item></feed>'
     r = feedparser._open_resource(s, '', '', '', '', [], {})
     self.assertEqual(s.encode('utf-8'), r.read())
Example No. 7
 def test_fileobj(self):
     r = feedparser._open_resource(sys.stdin, '', '', '', '', [], {})
     self.assertTrue(r is sys.stdin)
 def test_unicode_1(self):
     s = u'<feed><item><title>text</title></item></feed>'
     r = feedparser._open_resource(s, '', '', '', '', [], {})
     self.assertEqual(s.encode('utf-8'), r.read())
 def test_bytes(self):
     s = '<feed><item><title>text</title></item></feed>'.encode('utf-8')
     r = feedparser._open_resource(s, '', '', '', '', [], {})
     self.assertEqual(s, r.read())
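
Taken together, these tests pin down the _open_resource contract: a file-like object passes through untouched, while str, bytes, and unicode inputs come back wrapped so that read() yields UTF-8 bytes. A sketch of the dispatch the tests imply (names are illustrative; feedparser's real function also handles URLs, ETags, and compression):

import io

def open_resource_sketch(source):
    if hasattr(source, 'read'):
        return source  # file-like objects are returned as-is
    if isinstance(source, bytes):
        return io.BytesIO(source)  # bytes are wrapped directly
    return io.BytesIO(source.encode('utf-8'))  # text is encoded to UTF-8 first
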
Example No. 14
    if cache:
        if not nsurl.isFileURL():
            nsurl = cache_url(nsurl, fileext)

    url = NSURL2str(nsurl)

    if 0:
        # does not work with file urls
        r = requests.get(url)
        s = r.content
        headers = r.headers
        r.close()
    else:
        # fob = feedparser._open_resource(url, None, None, CactusVersion.user_agent, None, [], {})
        fob = feedparser._open_resource(url, None, None, None, None, [], {})
        s = fob.read()
        fob.close()

    # check for gzip compressed opml file
    try:
        if len(s) > 2:
            # gzip magic number: first two bytes are 0x1f 0x8b
            if ord(s[0]) == 0x1f and ord(s[1]) == 0x8b:
                s = gzip.GzipFile(
                    fileobj=StringIO.StringIO(s)).read()
    except Exception:
        pass
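
Sniffing the two gzip magic bytes before decompressing, as done above, works even for file URLs, which never carry a Content-Encoding header. The same check as a reusable helper (a sketch that accepts byte strings on both Python 2 and 3):

import gzip
import io

GZIP_MAGIC = b'\x1f\x8b'

def maybe_gunzip(data):
    """Return data decompressed when it starts with the gzip magic, else unchanged."""
    if data[:2] == GZIP_MAGIC:
        try:
            return gzip.GzipFile(fileobj=io.BytesIO(data)).read()
        except IOError:
            pass  # not a valid gzip stream after all; fall through
    return data
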