def parse(url_file_stream_or_string, etag=None, modified=None, agent=None,
          referrer=None, handlers=[]):
    '''Parse an XSPF playlist from a URL, file, stream, or string.'''
    # NB: the mutable default for `handlers` is shared across calls; it is
    # only read here, never mutated, so this is safe in this excerpt.
    result = FeedParserDict()
    result['playlist'] = FeedParserDict()
    if _XML_AVAILABLE:
        result['bozo'] = 0
    if type(handlers) == types.InstanceType:
        handlers = [handlers]
    try:
        f = _open_resource(url_file_stream_or_string, etag, modified,
                           agent, referrer, handlers)
        data = f.read()
    except Exception, e:
        result['bozo'] = 1
        result['bozo_exception'] = e
        data = ''
        f = None
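# A minimal usage sketch for parse(), assuming the full function (the body
# is truncated above) returns the populated FeedParserDict; the URL here is
# hypothetical.
result = parse('http://example.com/playlist.xspf')
if result.get('bozo'):
    # The fetch failed; the original exception is kept for inspection.
    print 'fetch failed:', result['bozo_exception']
else:
    print result['playlist']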
def run(num=None):
    feeds, feedfileObject = load()
    mailserver = None
    try:
        # We store the default "to" address as the first item in the feeds
        # list. Here we take it out and save it for later.
        default_to = ""
        if feeds and isstr(feeds[0]):
            default_to = feeds[0]
            ifeeds = feeds[1:]
        else:
            ifeeds = feeds

        if num:
            ifeeds = [feeds[num]]

        feednum = 0
        for f in ifeeds:
            try:
                feednum += 1
                if not f.active:
                    continue

                if VERBOSE:
                    print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
                r = {}
                try:
                    r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
                except TimeoutError:
                    print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
                    continue

                # Handle various status conditions, as required.
                if 'status' in r:
                    if r.status == 301:
                        f.url = r['url']
                    elif r.status == 410:
                        print >>warn, "W: feed gone; deleting", f.url
                        feeds.remove(f)
                        continue

                http_status = r.get('status', 200)
                if VERBOSE > 1:
                    print >>warn, "I: http status", http_status
                http_headers = r.get('headers', {
                    'content-type': 'application/rss+xml',
                    'content-length': '1'})
                exc_type = r.get("bozo_exception", Exception()).__class__
                if http_status != 304 and not r.entries and not r.get('version', ''):
                    if http_status not in [200, 302]:
                        print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)
                    elif contains(http_headers.get('content-type', 'rss'), 'html'):
                        print >>warn, "W: looks like HTML [%d] %s" % (feednum, f.url)
                    elif http_headers.get('content-length', '1') == '0':
                        print >>warn, "W: empty page [%d] %s" % (feednum, f.url)
                    elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
                        print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
                    elif exc_type == IOError:
                        print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
                    elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
                        print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
                    elif exc_type in socket_errors:
                        exc_reason = r.bozo_exception.args[1]
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    elif exc_type == urllib2.URLError:
                        if r.bozo_exception.reason.__class__ in socket_errors:
                            exc_reason = r.bozo_exception.reason.args[1]
                        else:
                            exc_reason = r.bozo_exception.reason
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    elif exc_type == AttributeError:
                        print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
                    elif exc_type == KeyboardInterrupt:
                        raise r.bozo_exception
                    elif r.bozo:
                        print >>warn, 'E: error in [%d] "%s" feed (%s)' % (
                            feednum, f.url, r.get("bozo_exception", "can't process"))
                    else:
                        print >>warn, "=== rss2email encountered a problem with this feed ==="
                        print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                        print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                        print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
                        print >>warn, r
                        print >>warn, "rss2email", __version__
                        print >>warn, "feedparser", feedparser.__version__
                        print >>warn, "html2text", h2t.__version__
                        print >>warn, "Python", sys.version
                        print >>warn, "=== END HERE ==="
                    continue

                r.entries.reverse()

                for entry in r.entries:
                    id = getID(entry)

                    # If TRUST_GUID isn't set, we get back hashes of the
                    # content. Instead of letting these run wild, we put them
                    # in context by associating them with the actual ID (if
                    # it exists).
                    frameid = entry.get('id')
                    if not frameid:
                        frameid = id
                    if type(frameid) is DictType:
                        frameid = frameid.values()[0]

                    # If this item's ID is in our database then it's already
                    # been sent and we don't need to do anything more.
                    if frameid in f.seen:
                        if f.seen[frameid] == id:
                            continue

                    if not (f.to or default_to):
                        print "No default email address defined. Please run 'r2e email emailaddress'"
                        print "Ignoring feed %s" % f.url
                        break

                    if 'title_detail' in entry and entry.title_detail:
                        title = entry.title_detail.value
                        if contains(entry.title_detail.type, 'html'):
                            title = html2text(title)
                    else:
                        title = getContent(entry)[:70]

                    title = title.replace("\n", " ").strip()

                    when = time.gmtime()
                    if DATE_HEADER:
                        for datetype in DATE_HEADER_ORDER:
                            kind = datetype + "_parsed"
                            if kind in entry and entry[kind]:
                                when = entry[kind]

                    link = entry.get('link', "")

                    from_addr = getEmail(r, entry)
                    name = h2t.unescape(getName(r, entry))
                    fromhdr = formataddr((name, from_addr,))
                    tohdr = (f.to or default_to)
                    subjecthdr = title
                    datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", when)
                    useragenthdr = "rss2email"

                    # Add post tags, if available.
                    tagline = ""
                    if 'tags' in entry:
                        tags = entry.get('tags')
                        taglist = []
                        if tags:
                            for tag in tags:
                                taglist.append(tag['term'])
                        if taglist:
                            tagline = ",".join(taglist)

                    extraheaders = {
                        'Date': datehdr,
                        'User-Agent': useragenthdr,
                        'X-RSS-Feed': f.url,
                        'Message-ID': '<%s>' % hashlib.sha1(id.encode('utf-8')).hexdigest(),
                        'X-RSS-ID': id,
                        'X-RSS-URL': link,
                        'X-RSS-TAGS': tagline,
                        'X-MUNGED-FROM': getMungedFrom(r),
                        'References': ''}
                    if BONUS_HEADER != '':
                        for hdr in BONUS_HEADER.strip().splitlines():
                            pos = hdr.strip().find(':')
                            if pos > 0:
                                extraheaders[hdr[:pos]] = hdr[pos + 1:].strip()
                            else:
                                print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER

                    entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
                    contenttype = 'plain'
                    content = ''

                    if THREAD_ON_TAGS and len(tagline):
                        extraheaders['References'] += ''.join(
                            [' <%s>' % hashlib.sha1(t.strip().encode('utf-8')).hexdigest()
                             for t in tagline.split(',')])

                    if USE_CSS_STYLING and HTML_MAIL:
                        contenttype = 'html'
                        content = "<html>\n"
                        content += '<head><meta http-equiv="Content-Type" content="text/html"><style>' + STYLE_SHEET + '</style></head>\n'
                        content += '<body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">\n'
                        content += '<div id="entry">\n'
                        content += '<h1 class="header"'
                        content += '><a href="' + link + '">' + subjecthdr + '</a></h1>\n'
                        if ishtml(entrycontent):
                            body = entrycontent[1].strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (
                                    summarize(html2text(body, plaintext=True), SUMMARIZE) + "<hr>")
                        else:
                            body = entrycontent.strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (
                                    summarize(body, SUMMARIZE) + "<hr>")
                        if THREAD_ON_LINKS:
                            parser = Parser()
                            parser.feed(body)
                            extraheaders['References'] += ''.join(
                                [' <%s>' % hashlib.sha1(h.strip().encode('utf-8')).hexdigest()
                                 for h in parser.attrs])
                        if INLINE_IMAGES_DATA_URI:
                            parser = Parser(tag='img', attr='src')
                            parser.feed(body)
                            for src in parser.attrs:
                                try:
                                    img = feedparser._open_resource(
                                        src, None, None, feedparser.USER_AGENT, link, [], {})
                                    data = img.read()
                                    if hasattr(img, 'headers'):
                                        headers = dict((k.lower(), v) for k, v
                                                       in dict(img.headers).items())
                                        ctype = headers.get('content-type', None)
                                        if ctype and INLINE_IMAGES_DATA_URI:
                                            body = body.replace(
                                                src, 'data:%s;base64,%s' % (ctype, base64.b64encode(data)))
                                except:
                                    print >>warn, "Could not load image: %s" % src
                        if body != '':
                            content += '<div id="body">\n' + body + '</div>\n'
                        content += '\n<p class="footer">URL: <a href="' + link + '">' + link + '</a>'
                        if hasattr(entry, 'enclosures'):
                            for enclosure in entry.enclosures:
                                if hasattr(enclosure, 'url') and enclosure.url != "":
                                    content += '<br/>Enclosure: <a href="' + enclosure.url + '">' + enclosure.url + "</a>\n"
                                if hasattr(enclosure, 'src') and enclosure.src != "":
                                    content += '<br/>Enclosure: <a href="' + enclosure.src + '">' + enclosure.src + '</a><br/><img src="' + enclosure.src + '"\n'
                        if 'links' in entry:
                            for extralink in entry.links:
                                if ('rel' in extralink) and extralink['rel'] == u'via':
                                    extraurl = extralink['href']
                                    extraurl = extraurl.replace(
                                        'http://www.google.com/reader/public/atom/',
                                        'http://www.google.com/reader/view/')
                                    viatitle = extraurl
                                    if 'title' in extralink:
                                        viatitle = extralink['title']
                                    content += '<br/>Via: <a href="' + extraurl + '">' + viatitle + '</a>\n'
                        content += '</p></div>\n'
                        content += "\n\n</body></html>"
                    else:
                        if ishtml(entrycontent):
                            contenttype = 'html'
                            content = ("<html><body>\n\n"
                                       + '<h1><a href="' + link + '">' + subjecthdr + '</a></h1>\n\n'
                                       + entrycontent[1].strip()  # drop type tag (HACK: bad abstraction)
                                       + '<p>URL: <a href="' + link + '">' + link + '</a></p>')
                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += 'Enclosure: <a href="' + enclosure.url + '">' + enclosure.url + "</a><br/>\n"
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += 'Via: <a href="' + extralink['href'] + '">' + extralink['title'] + '</a><br/>\n'
                            content += "\n</body></html>"
                        else:
                            content = entrycontent.strip() + "\n\nURL: " + link
                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += '\nEnclosure: ' + enclosure.url + "\n"
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += '<a href="' + extralink['href'] + '">Via: ' + extralink['title'] + '</a>\n'

                    mailserver = send(fromhdr, tohdr, subjecthdr, content,
                                      contenttype, when, extraheaders,
                                      mailserver, f.folder)

                    f.seen[frameid] = id

                f.etag, f.modified = r.get('etag', None), r.get('modified', None)
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                print >>warn, "=== rss2email encountered a problem with this feed ==="
                print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                print >>warn, "E: could not parse", f.url
                traceback.print_exc(file=warn)
                print >>warn, "rss2email", __version__
                print >>warn, "feedparser", feedparser.__version__
                print >>warn, "html2text", h2t.__version__
                print >>warn, "Python", sys.version
                print >>warn, "=== END HERE ==="
                continue
    finally:
        unlock(feeds, feedfileObject)

    if mailserver:
        if IMAP_MARK_AS_READ:
            for folder in IMAP_MARK_AS_READ:
                mailserver.select(folder)
                res, data = mailserver.search(None, '(UNSEEN UNFLAGGED)')
                if res == 'OK':
                    items = data[0].split()
                    for i in items:
                        res, data = mailserver.fetch(i, "(UID)")
                        if data[0]:
                            u = uid(data[0])
                            res, data = mailserver.uid('STORE', u, '+FLAGS', r'(\Seen)')
        if IMAP_MOVE_READ_TO:
            typ, data = mailserver.list(pattern='*')
            # Parse the folder listing as a CSV dialect (this automatically
            # removes the quotes).
            reader = csv.reader(StringIO.StringIO('\n'.join(data)),
                                dialect='mailboxlist')
            # Iterate over each folder.
            for row in reader:
                folder = row[-1:][0]
                if folder == IMAP_MOVE_READ_TO or r'\Noselect' in row[0]:
                    continue
                mailserver.select(folder)
                yesterday = (datetime.now() - timedelta(days=1)).strftime("%d-%b-%Y")
                res, data = mailserver.search(None, '(SEEN BEFORE %s UNFLAGGED)' % yesterday)
                if res == 'OK':
                    items = data[0].split()
                    for i in items:
                        res, data = mailserver.fetch(i, "(UID)")
                        if data[0]:
                            u = uid(data[0])
                            res, data = mailserver.uid('COPY', u, IMAP_MOVE_READ_TO)
                            if res == 'OK':
                                res, data = mailserver.uid('STORE', u, '+FLAGS', r'(\Deleted)')
                                mailserver.expunge()
        try:
            mailserver.quit()
        except:
            mailserver.logout()
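# The csv.reader(..., dialect='mailboxlist') call above assumes a dialect
# named 'mailboxlist' was registered elsewhere in the program. A plausible
# registration -- an assumption, not the project's actual definition -- that
# treats an IMAP LIST reply like '(\HasNoChildren) "/" "INBOX/Feeds"' as
# space-delimited fields with double-quoted folder names:
import csv
csv.register_dialect('mailboxlist', delimiter=' ', quotechar='"')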
def getLink(self):
    """Reads the HTML page and extracts the link, title and body."""
    if not self.children.intersection(self.attrs):
        return  # mandatory child element missing

    self.loadCache()
    try:
        f = feedparser._open_resource(self.uri, self.etag, self.modified,
                                      USER_AGENT, None, [], {})
        html = f.read()
    except Exception as e:
        sys.stderr.write('Getting page %s: %s\n' % (self.uri, e))
        return

    if getattr(f, 'status', None) == 304 or not html:
        # not modified or empty page
        return

    # save HTTP headers
    if hasattr(f, 'info'):
        info = f.info()
        etag = info.getheader('ETag')
        modified = info.getheader('Last-Modified')
        if modified:
            modified = feedparser._parse_date(modified)
        self.saveCache(etag, modified)

        # if the page is compressed, decompress it
        ce = info.getheader('Content-Encoding', '')
        if ce == 'gzip':
            try:
                import gzip
                import StringIO
                html = gzip.GzipFile(fileobj=StringIO.StringIO(html)).read()
            except Exception as e:
                sys.stderr.write('Unzipping page %s: %s\n' % (self.uri, e))
                return
        elif ce == 'deflate':
            try:
                import zlib
                html = zlib.decompress(html, -zlib.MAX_WBITS)
            except Exception as e:
                sys.stderr.write('Inflating page %s: %s\n' % (self.uri, e))
                return

    # resolve relative URIs
    html = feedparser._resolveRelativeURIs(html, self.uri, self.encoding,
                                           'text/html')

    if hasattr(f, 'headers'):
        charsets = [c for c in feedparser._getCharacterEncoding(f.headers, html) if c]
    else:
        charsets = [self.encoding]
    for charset in charsets:
        try:
            html = html.decode(charset)
            break
        except UnicodeDecodeError:
            pass
        except LookupError:
            pass

    if 'regex' in self.attrs:
        self.match_regex(html)
    else:
        self.match_xpath(html)
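# The Content-Encoding handling in getLink() can be exercised on its own.
# A self-contained sketch (Python 2, standard library only; the helper name
# is hypothetical):
import gzip
import zlib
import StringIO

def decompress_body(body, content_encoding):
    # gzip bodies carry a full gzip header; deflate bodies are raw DEFLATE
    # streams, hence the negative window-bits value.
    if content_encoding == 'gzip':
        return gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
    elif content_encoding == 'deflate':
        return zlib.decompress(body, -zlib.MAX_WBITS)
    return body

# Round-trip checks for both branches:
buf = StringIO.StringIO()
g = gzip.GzipFile(fileobj=buf, mode='wb')
g.write('<html>hi</html>')
g.close()
assert decompress_body(buf.getvalue(), 'gzip') == '<html>hi</html>'

co = zlib.compressobj(9, zlib.DEFLATED, -zlib.MAX_WBITS)
raw = co.compress('<html>hi</html>') + co.flush()
assert decompress_body(raw, 'deflate') == '<html>hi</html>'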
fileext = translateType.get(type_, "")

defaults = NSUserDefaults.standardUserDefaults()
cache = False
try:
    cache = bool(defaults.objectForKey_(u'optCache'))
except StandardError, err:
    print "ERROR reading defaults.", repr(err)

if cache:
    if not nsurl.isFileURL():
        nsurl = cache_url(nsurl, fileext)

url = NSURL2str(nsurl)

fob = feedparser._open_resource(url, None, None, CactusVersion.user_agent,
                                None, [], {})
s = fob.read()
fob.close()

if type_ == CactusOPMLType:
    # This is a quick & dirty approach and should be applied much more
    # carefully than it is now... perhaps those errors get corrected and
    # <directivecache> will be a proper node.

    # Clean up the bogus XML declaration. OPML-Editor, I'm looking at you...
    if s.startswith('<?xml encoding="ISO-8859-1" version="1.0"?>'):
        s = s.replace('<?xml encoding="ISO-8859-1" version="1.0"?>',
                      '<?xml version="1.0" encoding="ISO-8859-1"?>')
        if kwlog:
            print "\nBOGUS XML DECLARATION REPLACED\n"
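# The declaration fix-up above is easy to test in isolation: XML requires
# 'version' to precede 'encoding' in the declaration, which is exactly what
# the swap restores. A minimal sketch (function name hypothetical):
BOGUS_DECL = '<?xml encoding="ISO-8859-1" version="1.0"?>'
FIXED_DECL = '<?xml version="1.0" encoding="ISO-8859-1"?>'

def fix_opml_declaration(s):
    # Swap the attribute order so the declaration is well-formed XML.
    if s.startswith(BOGUS_DECL):
        s = FIXED_DECL + s[len(BOGUS_DECL):]
    return s

assert fix_opml_declaration(BOGUS_DECL + '<opml/>') == FIXED_DECL + '<opml/>'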
def test_unicode_2(self):
    s = u'<feed><item><title>t\u00e9xt</title></item></feed>'
    r = feedparser._open_resource(s, '', '', '', '', [], {})
    self.assertEqual(s.encode('utf-8'), r.read())
def test_string(self):
    s = '<feed><item><title>text</title></item></feed>'
    r = feedparser._open_resource(s, '', '', '', '', [], {})
    self.assertEqual(s.encode('utf-8'), r.read())
def test_fileobj(self):
    r = feedparser._open_resource(sys.stdin, '', '', '', '', [], {})
    self.assertTrue(r is sys.stdin)
def test_unicode_1(self):
    s = u'<feed><item><title>text</title></item></feed>'
    r = feedparser._open_resource(s, '', '', '', '', [], {})
    self.assertEqual(s.encode('utf-8'), r.read())
def test_bytes(self):
    s = '<feed><item><title>text</title></item></feed>'.encode('utf-8')
    r = feedparser._open_resource(s, '', '', '', '', [], {})
    self.assertEqual(s, r.read())
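# In the same spirit as test_fileobj above, any object with a read() method
# should pass through _open_resource untouched. A hypothetical extra case,
# not part of the original suite:
def test_fileobj_stringio(self):
    import StringIO
    fobj = StringIO.StringIO('<feed><item><title>text</title></item></feed>')
    r = feedparser._open_resource(fobj, '', '', '', '', [], {})
    self.assertTrue(r is fobj)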
if cache:
    if not nsurl.isFileURL():
        nsurl = cache_url(nsurl, fileext)

url = NSURL2str(nsurl)

if 0:
    # does not work with file urls
    r = requests.get(url)
    s = r.content
    headers = r.headers
    r.close()
else:
    # fob = feedparser._open_resource(url, None, None, CactusVersion.user_agent, None, [], {})
    fob = feedparser._open_resource(url, None, None, None, None, [], {})
    s = fob.read()
    fob.close()

# check for a gzip-compressed opml file
# pdb.set_trace()
try:
    if len(s) > 2:
        if ord(s[0]) == 0x1f:
            if ord(s[1]) == 0x8b:
                unzipped = gzip.GzipFile(fileobj=StringIO.StringIO(s)).read()
                s = unzipped
except Exception:
    pass
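# The magic-byte check above generalizes to a small helper; a sketch under
# the same Python 2 assumptions (byte strings, StringIO), helper name
# hypothetical:
import gzip
import StringIO

def maybe_gunzip(s):
    # gzip streams begin with the magic bytes 0x1f 0x8b; anything else is
    # returned unchanged, matching the defensive behaviour above.
    if len(s) > 2 and ord(s[0]) == 0x1f and ord(s[1]) == 0x8b:
        try:
            return gzip.GzipFile(fileobj=StringIO.StringIO(s)).read()
        except Exception:
            pass
    return s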