def parse_profile(self, profile, consts):
    """Fill this contact from a raw AuM *profile* dict.

    Sets online status and status message, the summary text, the photo
    list, and builds ``self.profile`` (an OrderedDict of ProfileNode
    sections driven by ``self.TABLE``).

    :param profile: decoded profile data from the site API
    :param consts: per-sex constant tables, indexed by int(profile['sex'])
    """
    # Fix: the original assigned u'online' to status_msg and then
    # unconditionally overwrote it on the very next statement; the dead
    # assignment has been removed (behavior unchanged).
    if profile['online']:
        self.status = Contact.STATUS_ONLINE
        self.status_msg = u'since %s' % profile['last_cnx']
    else:
        self.status = Contact.STATUS_OFFLINE
        self.status_msg = u'last connection %s' % profile['last_cnx']

    self.summary = unicode(unescape(profile.get('announce', '').strip()))
    if len(profile.get('shopping_list', '')) > 0:
        self.summary += u'\n\nLooking for:\n%s' % unescape(profile['shopping_list'].strip())

    for photo in profile['pics']:
        self.set_photo(photo.split('/')[-1],
                       url=photo + '/full',
                       thumbnail_url=photo + '/small',
                       hidden=False)

    self.profile = OrderedDict()
    if 'sex' in profile:
        for section, d in self.TABLE.items():
            flags = ProfileNode.SECTION
            # Leading '_' marks a head section; '+'/'-' restrict a section
            # to one sex (1 / 0 respectively).
            if section.startswith('_'):
                flags |= ProfileNode.HEAD
            if (section.startswith('+') and int(profile['sex']) != 1) or \
               (section.startswith('-') and int(profile['sex']) != 0):
                continue

            section = section.lstrip('_+-')
            s = ProfileNode(section, section.capitalize(), OrderedDict(), flags=flags)

            for key, builder in d.items():
                try:
                    value = builder.get_value(profile, consts[int(profile['sex'])])
                except KeyError:
                    # Field absent from this profile: simply skip it.
                    pass
                else:
                    s.value[key] = ProfileNode(key, key.capitalize().replace('_', ' '), value)

            self.profile[section] = s

    # Keep the raw payload around for later use.
    self._aum_profile = profile
def test_unescape(self):
    """html2text.unescape should leave this sample text unchanged.

    NOTE(review): input and expected strings are identical here; the
    HTML entities may have been decoded when this file was extracted —
    confirm against the original test source.
    """
    sample = '<pre>and then<div> & other tags'
    expected = '<pre>and then<div> & other tags'
    decoded = html2text.unescape(sample)
    self.assertEqual(expected, decoded)
def decode(self, content):
    """Decode HTML entities in *content* while protecting literal markup."""
    # Apply the project's HTML replacement table first.
    text = self._replace(content, self.replace_html)
    # < and > that are still present need to be distinguishable from
    # actual entities that get decoded to < and >, so mark them.
    text = re.sub('(<|>)', r'_!!\1', text)
    text = html2text.unescape(text)
    # Restore the protected non-breaking-space placeholder.
    return text.replace(' _place_holder;', ' ')
def parse_profile(self, profile, consts):
    """Fill this contact from a raw AuM *profile* dict.

    Sets online status and status message, the summary text, the photo
    list, and builds ``self.profile`` (an OrderedDict of ProfileNode
    sections driven by ``self.TABLE``).

    :param profile: decoded profile data from the site API
    :param consts: per-sex constant tables, indexed by int(profile['sex'])
    """
    # Fix: removed a dead `self.status_msg = u'online'` that was
    # unconditionally overwritten by the next statement (behavior unchanged).
    if profile['online']:
        self.status = Contact.STATUS_ONLINE
        self.status_msg = u'since %s' % profile['last_cnx']
    else:
        self.status = Contact.STATUS_OFFLINE
        self.status_msg = u'last connection %s' % profile['last_cnx']

    self.summary = unicode(unescape(profile.get('announce', '').strip()))
    if len(profile.get('shopping_list', '')) > 0:
        self.summary += u'\n\nLooking for:\n%s' % unescape(profile['shopping_list'].strip())

    for photo in profile['pics']:
        self.set_photo(photo.split('/')[-1],
                       url=photo + '/full',
                       thumbnail_url=photo + '/small',
                       hidden=False)

    self.profile = OrderedDict()
    if 'sex' in profile:
        for section, d in self.TABLE.iteritems():
            flags = ProfileNode.SECTION
            # Leading '_' marks a head section; '+'/'-' restrict a section
            # to one sex (1 / 0 respectively).
            if section.startswith('_'):
                flags |= ProfileNode.HEAD
            if (section.startswith('+') and int(profile['sex']) != 1) or \
               (section.startswith('-') and int(profile['sex']) != 0):
                continue

            section = section.lstrip('_+-')
            s = ProfileNode(section, section.capitalize(), OrderedDict(), flags=flags)

            for key, builder in d.iteritems():
                try:
                    value = builder.get_value(profile, consts[int(profile['sex'])])
                except KeyError:
                    # Field absent from this profile: simply skip it.
                    pass
                else:
                    s.value[key] = ProfileNode(key, key.capitalize().replace('_', ' '), value)

            self.profile[section] = s

    # Keep the raw payload around for later use.
    self._aum_profile = profile
def unescape(value):
    """Convert HTML entities in *value* to plain text.

    Falls back to returning *value* untouched when html2text is not
    installed. Also collapses '--' to '-' and restores the protected
    non-breaking-space placeholder.
    """
    try:
        import html2text as h2t
    except ImportError:
        # html2text is optional: without it, pass the text through.
        return value
    decoded = h2t.unescape(value, unicode_snob=True)
    return decoded.replace("--", "-").replace(' _place_holder;', ' ')
def _get_entry_name(self, parsed, entry): """Get the best name >>> import feedparser >>> f = Feed(name='test-feed') >>> parsed = feedparser.parse( ... '<feed xmlns="http://www.w3.org/2005/Atom">\\n' ... ' <entry>\\n' ... ' <author>\\n' ... ' <name>Example author</name>\\n' ... ' <email>[email protected]</email>\\n' ... ' <url>http://example.com/</url>\\n' ... ' </author>\\n' ... ' </entry>\\n' ... '</feed>\\n' ... ) >>> entry = parsed.entries[0] >>> f.name_format = '' >>> f._get_entry_name(parsed, entry) '' >>> f.name_format = '{author}' >>> f._get_entry_name(parsed, entry) 'Example author' >>> f.name_format = '{feed-title}: {author}' >>> f._get_entry_name(parsed, entry) ': Example author' >>> f.name_format = '{author} ({feed.name})' >>> f._get_entry_name(parsed, entry) 'Example author (test-feed)' """ if not self.name_format: return '' data = { 'feed': self, 'feed-name': self.name, 'feed-url': self.url, 'feed-title': '<feed title>', 'author': '<author>', 'publisher': '<publisher>', } feed = parsed.feed data['feed-title'] = feed.get('title', '') for x in [entry, feed]: if 'name' in x.get('author_detail', []): if x.author_detail.name: data['author'] = x.author_detail.name break if 'name' in feed.get('publisher_detail', []): data['publisher'] = feed.publisher_detail.name name = self.name_format.format(**data) return _html2text.unescape(name)
def get_thread(self, thread):
    """Build the complete message chain of a discussion.

    Accepts a Thread object or a thread id; returns the Thread with its
    title, root message and parent/child links filled in.
    """
    if not isinstance(thread, Thread):
        thread = Thread(thread)
        thread.flags = Thread.IS_DISCUSSION

    messages = self.browser.get_thread_messages(thread.id)

    # Last-read marker from local storage; anything newer is unread.
    contact = self.storage.get('s***s', thread.id, default={'lastmsg': datetime(1970, 1, 1)})
    thread.title = u'Discussion with %s' % messages['fields']['username']

    me = OkcContact(self.browser.get_profile(self.browser.me['userid']))
    other = OkcContact(self.browser.get_profile(thread.id))

    previous = None
    for item in messages['messages']['messages']:
        posted_at = datetime.fromtimestamp(item['timestamp'])

        flags = Message.IS_UNREAD if contact['lastmsg'] < posted_at else 0

        # Messages coming from the thread's contact are from "other".
        if item['from'] == thread.id:
            sender, receiver = other, me
        else:
            sender, receiver = me, other

        msg = Message(thread=thread,
                      id=item['id'],
                      title=thread.title,
                      sender=sender.name,
                      receivers=[receiver.name],
                      date=posted_at,
                      content=to_unicode(unescape(item['body'])),
                      children=[],
                      parent=previous,
                      signature=sender.get_text(),
                      flags=flags)
        # Chain the messages: each one becomes the single child of the
        # previous; the first one becomes the root.
        if previous:
            previous.children = [msg]
        else:
            thread.root = msg
        previous = msg

    return thread
def _get_entry_name(self, parsed, entry): """Get the best name >>> import feedparser >>> f = Feed(name='test-feed') >>> parsed = feedparser.parse( ... '<feed xmlns="http://www.w3.org/2005/Atom">\\n' ... ' <entry>\\n' ... ' <author>\\n' ... ' <name>Example author</name>\\n' ... ' <email>[email protected]</email>\\n' ... ' <url>http://example.com/</url>\\n' ... ' </author>\\n' ... ' </entry>\\n' ... '</feed>\\n' ... ) >>> entry = parsed.entries[0] >>> f.name_format = '' >>> f._get_entry_name(parsed, entry) '' >>> f.name_format = '{author}' >>> f._get_entry_name(parsed, entry) 'Example author' >>> f.name_format = '{feed-title}: {author}' >>> f._get_entry_name(parsed, entry) ': Example author' >>> f.name_format = '{author} ({feed.name})' >>> f._get_entry_name(parsed, entry) 'Example author (test-feed)' """ if not self.name_format: return '' data = { 'feed': self, 'feed-title': '<feed title>', 'author': '<author>', 'publisher': '<publisher>', } feed = parsed.feed data['feed-title'] = feed.get('title', '') for x in [entry, feed]: if 'name' in x.get('author_detail', []): if x.author_detail.name: data['author'] = x.author_detail.name break if 'name' in feed.get('publisher_detail', []): data['publisher'] = feed.publisher_detail.name name = self.name_format.format(**data) return _html2text.unescape(name)
def search_problems(cn):
    """Extract a plain-text problem description from a cnblogs post page.

    Takes the raw HTML *cn*, keeps the lines of the post body up to the
    first one containing a CJK period or fullwidth comma, and converts
    that chunk to cleaned-up text.
    """
    # Everything after the post-body marker, past its closing '>'.
    _, remainder = cn.split('id="cnblogs_post_body"', 1)
    remainder = remainder[remainder.find('>') + 1:]

    kept = []
    for line in remainder.split('\n'):
        # Stop at the first line that looks like running prose.
        if '。' in line or ',' in line:
            break
        kept.append(line)

    desc = html2text.unescape(html2text.html2text(''.join(kept)))
    desc = re.sub(r' +\n+', '\n', desc)
    return desc.strip()
def get_thread(self, thread):
    """Return the discussion thread with all of its messages linked.

    *thread* may be a Thread instance or a bare thread id.
    """
    if not isinstance(thread, Thread):
        thread = Thread(thread)
        thread.flags = Thread.IS_DISCUSSION

    messages = self.browser.get_thread_messages(thread.id)

    # Stored last-read timestamp; newer messages get the unread flag.
    contact = self.storage.get('s***s', thread.id, default={'lastmsg': datetime(1970, 1, 1)})
    thread.title = u'Discussion with %s' % messages['fields']['username']

    me = OkcContact(self.browser.get_profile(self.browser.me['userid']))
    other = OkcContact(self.browser.get_profile(thread.id))

    prev_msg = None
    for raw in messages['messages']['messages']:
        when = datetime.fromtimestamp(raw['timestamp'])

        msg_flags = 0
        if contact['lastmsg'] < when:
            msg_flags = Message.IS_UNREAD

        # The contact's own messages carry the thread id as sender.
        from_other = raw['from'] == thread.id
        sender = other if from_other else me
        receiver = me if from_other else other

        msg = Message(thread=thread,
                      id=raw['id'],
                      title=thread.title,
                      sender=sender.name,
                      receivers=[receiver.name],
                      date=when,
                      content=to_unicode(unescape(raw['body'])),
                      children=[],
                      parent=prev_msg,
                      signature=sender.get_text(),
                      flags=msg_flags)
        # Single-child chain: first message is the root, each following
        # message is attached as the only child of its predecessor.
        if prev_msg:
            prev_msg.children = [msg]
        else:
            thread.root = msg
        prev_msg = msg

    return thread
def get_torrent(self, id):
    """Parse the currently loaded torrent-details page into a Torrent.

    :param id: identifier used to build the returned Torrent
    :returns: a Torrent with url/magnet/size/seeders/leechers/description set
    """
    url = NotAvailable
    magnet = NotAvailable
    # Walk every <div>; the interesting ones are recognized by id/class.
    for div in self.document.getiterator('div'):
        if div.attrib.get('id', '') == 'title':
            title = unicode(unescape(div.text.strip()))
        elif div.attrib.get('class', '') == 'download':
            for link in self.parser.select(div, 'a'):
                href = link.attrib.get('href', '')
                # https fails on the download server, so strip it
                if href.startswith('https://'):
                    href = href.replace('https://', 'http://', 1)
                if href.startswith('magnet:'):
                    magnet = unicode(href)
                elif len(href):
                    url = unicode(href)
        elif div.attrib.get('id', '') == 'details':
            # NOTE(review): relies on the exact DOM layout — the 6th child of
            # the first child is assumed to hold "... (<n> Bytes)"; confirm
            # against the live page.
            size = float(div.getchildren()[0].getchildren()[5].text.split('(')[1].split('Bytes')[0])
            if len(div.getchildren()) > 1 \
               and div.getchildren()[1].attrib.get('class', '') == 'col2':
                child_to_explore = div.getchildren()[1]
            else:
                child_to_explore = div.getchildren()[0]
            # Seeders/leechers appear as label/value sibling pairs: remember
            # the previous element's text to know what the current one means.
            prev_child_txt = "none"
            seed = "-1"
            leech = "-1"
            for ch in child_to_explore.getchildren():
                if prev_child_txt == "Seeders:":
                    seed = ch.text
                if prev_child_txt == "Leechers:":
                    leech = ch.text
                prev_child_txt = ch.text
        elif div.attrib.get('class', '') == 'nfo':
            description = unicode(div.getchildren()[0].text_content().strip())
    # NOTE(review): title/size/description are only bound when the matching
    # divs exist; a page missing them raises NameError here.
    torrent = Torrent(id, title)
    torrent.url = url or NotAvailable
    torrent.magnet = magnet
    torrent.size = size
    torrent.seeders = int(seed)
    torrent.leechers = int(leech)
    torrent.description = description
    torrent.files = NotAvailable
    return torrent
def get_torrent(self, id):
    """Parse the currently loaded torrent-details page into a Torrent.

    :param id: identifier used to build the returned Torrent
    :returns: a Torrent with url/magnet/size/seeders/leechers/description set
    """
    url = NotAvailable
    magnet = NotAvailable
    # Walk every <div>; the interesting ones are recognized by id/class.
    for div in self.document.getiterator('div'):
        if div.attrib.get('id', '') == 'title':
            title = unicode(unescape(div.text.strip()))
        elif div.attrib.get('class', '') == 'download':
            for link in self.parser.select(div, 'a'):
                href = link.attrib.get('href', '')
                # https fails on the download server, so strip it
                if href.startswith('https://'):
                    href = href.replace('https://', 'http://', 1)
                if href.startswith('magnet:'):
                    magnet = unicode(href)
                elif len(href):
                    url = unicode(href)
        elif div.attrib.get('id', '') == 'details':
            # NOTE(review): relies on the exact DOM layout — the 6th child of
            # the first child is assumed to hold "... (<n> Bytes)"; confirm
            # against the live page.
            size = float(div.getchildren()[0].getchildren()[5].text.split('(')[1].split('Bytes')[0])
            if len(div.getchildren()) > 1 \
               and div.getchildren()[1].attrib.get('class', '') == 'col2':
                child_to_explore = div.getchildren()[1]
            else:
                child_to_explore = div.getchildren()[0]
            # Seeders/leechers appear as label/value sibling pairs: remember
            # the previous element's text to know what the current one means.
            prev_child_txt = "none"
            seed = "-1"
            leech = "-1"
            for ch in child_to_explore.getchildren():
                if prev_child_txt == "Seeders:":
                    seed = ch.text
                if prev_child_txt == "Leechers:":
                    leech = ch.text
                prev_child_txt = ch.text
        elif div.attrib.get('class', '') == 'nfo':
            description = unicode(div.getchildren()[0].text_content().strip())
    # NOTE(review): title/size/description are only bound when the matching
    # divs exist; a page missing them raises NameError here.
    torrent = Torrent(id, title)
    torrent.url = url or NotAvailable
    torrent.magnet = magnet
    torrent.size = size
    torrent.seeders = int(seed)
    torrent.leechers = int(leech)
    torrent.description = description
    torrent.files = NotAvailable
    return torrent
def _get_entry_name(self, parsed, entry): """Get the best name >>> import feedparser >>> f = Feed(name='test-feed') >>> parsed = feedparser.parse( ... '<feed xmlns="http://www.w3.org/2005/Atom">\\n' ... ' <entry>\\n' ... ' <author>\\n' ... ' <name>Example author</name>\\n' ... ' <email>[email protected]</email>\\n' ... ' <url>http://example.com/</url>\\n' ... ' </author>\\n' ... ' </entry>\\n' ... '</feed>\\n' ... ) >>> entry = parsed.entries[0] >>> f.friendly_name = False >>> f._get_entry_name(parsed, entry) '' >>> f.friendly_name = True >>> f._get_entry_name(parsed, entry) 'Example author' """ if not self.friendly_name: return '' parts = [''] feed = parsed.feed parts.append(feed.get('title', '')) for x in [entry, feed]: if 'name' in x.get('author_detail', []): if x.author_detail.name: if ''.join(parts): parts.append(': ') parts.append(x.author_detail.name) break if not ''.join(parts) and self.use_publisher_email: if 'name' in feed.get('publisher_detail', []): if ''.join(parts): parts.append(': ') parts.append(feed.publisher_detail.name) return _html2text.unescape(''.join(parts))
def iter_torrents(self):
    """Yield Torrent objects parsed from the search-result table.

    Yields nothing at all when the result table is absent/broken.
    """
    try:
        table = self.parser.select(self.document.getroot(), 'table#searchResult', 1)
    except BrokenPageError:
        # No result table on the page: nothing to yield.
        return
    # Skip the very first row (table header).
    first = True
    for tr in table.getiterator('tr'):
        if first:
            first = False
            continue
        if tr.get('class', '') != "header":
            # NOTE(review): positional parsing — assumes a fixed column
            # layout (cell 1 holds the title div + link, cells 2/3 hold
            # seeders/leechers); confirm against the site's markup.
            td = tr.getchildren()[1]
            div = td.getchildren()[0]
            link = div.find('a').attrib['href']
            title = unicode(unescape(div.find('a').text))
            # Torrent id is the third path component of the details link.
            idt = link.split('/')[2]
            a = td.getchildren()[1]
            url = unicode(a.attrib['href'])
            # The <font> cell reads like "Uploaded ..., Size <n>\xa0<unit>, ...";
            # split out the number and the unit ('i' dropped: GiB -> GB).
            size = td.find('font').text.split(',')[1].strip()
            u = size.split(' ')[1].split(u'\xa0')[1].replace('i', '')
            size = size.split(' ')[1].split(u'\xa0')[0]
            seed = tr.getchildren()[2].text
            leech = tr.getchildren()[3].text
            torrent = Torrent(idt, title)
            torrent.url = url
            torrent.size = self.unit(float(size), u)
            torrent.seeders = int(seed)
            torrent.leechers = int(leech)
            # Details are fetched lazily on demand.
            torrent.description = NotLoaded
            torrent.files = NotLoaded
            torrent.magnet = NotLoaded
            yield torrent
str).str.replace('_fr', '') wc_translation_data = wc_en_data.merge( wc_fr_data, left_on=['SKU'], right_on=['original_sku']) wc_translation_data # %% wc_title_translations = wc_translation_data[['Name_x', 'Name_y']] wc_title_translations.columns = ['source', 'target'] wc_description_translations = wc_translation_data[[ 'Description_x', 'Description_y']].dropna(axis=0) content_translation = pd.DataFrame() content_translation['source'] = '<p>' + \ wc_description_translations['Description_x']+'</p>' content_translation['target'] = '<p>' + \ wc_description_translations['Description_y']+'</p>' wc_description_translations.columns = ['source', 'target'] shopify_langify_import = pd.concat( [wc_title_translations, content_translation], ignore_index=True) shopify_langify_import.drop_duplicates(inplace=True) shopify_langify_import['source'] = shopify_langify_import['source'].apply( lambda x: unescape(x).replace(' ',' ')) shopify_langify_import['target'] = shopify_langify_import['target'].apply( lambda x: unescape(x).replace(' ',' ')) # %% WC_TRANSLATION_CSV = 'translation/shopify-import-langify-aw.csv' shopify_langify_import.to_csv(WC_TRANSLATION_CSV, mode='w+', index=False) # %%
$NetBSD: patch-rss2email_feed.py,v 1.3 2019/08/28 19:44:48 schmonz Exp $ html2text no longer provides unescape(). Python 3.4's html module does. <https://github.com/rss2email/rss2email/commit/81824e25723dcd2936f25f64ebc16f2e8ec9f310> --- rss2email/feed.py.orig 2014-09-01 23:21:01.000000000 +0000 +++ rss2email/feed.py @@ -48,6 +48,7 @@ import xml.sax.saxutils as _saxutils import feedparser as _feedparser import html2text as _html2text +import html as _html from . import __url__ from . import __version__ @@ -595,7 +596,7 @@ class Feed (object): if 'name' in feed.get('publisher_detail', []): data['publisher'] = feed.publisher_detail.name name = self.name_format.format(**data) - return _html2text.unescape(name) + return _html.unescape(name) def _validate_email(self, email, default=None): """Do a basic quality check on email address
def do_login(self):
    """
    Attempt to log in.
    Note: this method does nothing if we are already logged in.
    """
    self.BASEURL = 'https://%s/' % self.first_domain
    self._sag = None

    if not self.home_page.is_here():
        self.home_page.go()

    if self.new_login:
        # New login flow: the auth service redirects us; rebase BASEURL on it.
        self.page.go_to_auth()
        parsed = urlparse(self.url)
        self.BASEURL = '%s://%s' % (parsed.scheme, parsed.netloc)
    else:
        # On the homepage, we get the URL of the auth service.
        url = self.page.get_post_url()
        if url is None:
            raise WebsiteNotSupported()

        # First, post account number to get the password prompt.
        data = {
            'CCPTE': self.username[:11].encode('iso8859-15'),
            'canal': 'WEB',
            'hauteur_ecran': 768,
            'largeur_ecran': 1024,
            'liberror': '',
            'matrice': 'true',
            'origine': 'vitrine',
            'situationTravail': 'BANCAIRE',
            'typeAuthentification': 'CLIC_ALLER',
            'urlOrigine': self.page.url,
            'vitrine': 0,
        }
        parsed = urlparse(url)
        self.BASEURL = '%s://%s' % (parsed.scheme, parsed.netloc)
        self.location(url, data=data)
        assert self.login_page.is_here()

    # Then, post the password.
    self.page.login(self.username, self.password)

    if self.new_login:
        url = self.page.get_accounts_url()
    else:
        # The result of POST is the destination URL.
        url = self.page.get_result_url()

    # A non-http result means the site returned an error message instead
    # of a redirect target.
    if not url.startswith('http'):
        raise BrowserIncorrectPassword(unescape(url, unicode_snob=True))
    self.location(url.replace('Synthese', 'Synthcomptes'))

    if self.login_error.is_here():
        raise BrowserIncorrectPassword()

    if self.page is None:
        raise WebsiteNotSupported()

    if not self.accounts.is_here():
        # Sometimes the home page is Releves.
        new_url = re.sub('act=([^&=]+)', 'act=Synthcomptes', self.page.url, 1)
        self.location(new_url)

    if not self.accounts.is_here():
        raise BrowserIncorrectPassword()

    if self.code_caisse is None:
        self.code_caisse = self.page.get_code_caisse()

    # Store the current url to go back when requesting accounts list.
    self.accounts_url = re.sub('sessionSAG=[^&]+', 'sessionSAG={0}', self.page.url)

    # we can deduce the URL to "savings" and "loan" accounts from the regular accounts one
    self.savings_url = re.sub('act=([^&=]+)', 'act=Synthepargnes', self.accounts_url, 1)
    self.loans_url = re.sub('act=([^&=]+)', 'act=Synthcredits', self.accounts_url, 1)
    self.advisor_url = re.sub('act=([^&=]+)', 'act=Contact', self.accounts_url, 1)
    self.profile_url = re.sub('act=([^&=]+)', 'act=Coordonnees', self.accounts_url, 1)

    if self.page.check_perimeters() and not self.broken_perimeters:
        # Multi-perimeter accounts need an extra round-trip to enumerate them.
        self.perimeter_url = re.sub('act=([^&=]+)', 'act=Perimetre', self.accounts_url, 1)
        self.chg_perimeter_url = '%s%s' % (re.sub('act=([^&=]+)', 'act=ChgPerim', self.accounts_url, 1), '&typeaction=ChgPerim')
        self.location(self.perimeter_url.format(self.sag))
        self.page.check_multiple_perimeters()
def run(num=None):
    """Process every active feed (or only feed *num*) and mail new entries.

    Loads the feed list, fetches each feed with a timeout, reports HTTP
    and parser errors, converts each unseen entry to an email, sends it,
    and finally saves state (seen ids, etag, modified) and unlocks.
    """
    feeds, feedfileObject = load()
    smtpserver = None
    try:
        # We store the default to address as the first item in the feeds list.
        # Here we take it out and save it for later.
        default_to = ""
        if feeds and isstr(feeds[0]):
            default_to = feeds[0]
            ifeeds = feeds[1:]
        else:
            ifeeds = feeds

        if num:
            ifeeds = [feeds[num]]
        feednum = 0

        for f in ifeeds:
            try:
                feednum += 1
                if not f.active:
                    continue

                if VERBOSE:
                    print >> warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
                r = {}
                try:
                    # Fetch with a hard time limit so one dead feed cannot hang the run.
                    r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
                except TimeoutError:
                    print >> warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
                    continue

                # Handle various status conditions, as required
                if 'status' in r:
                    if r.status == 301:
                        # Permanent redirect: remember the new URL.
                        f.url = r['url']
                    elif r.status == 410:
                        print >> warn, "W: feed gone; deleting", f.url
                        feeds.remove(f)
                        continue

                http_status = r.get('status', 200)
                if VERBOSE > 1:
                    print >> warn, "I: http status", http_status
                http_headers = r.get('headers', {
                    'content-type': 'application/rss+xml',
                    'content-length': '1'
                })
                exc_type = r.get("bozo_exception", Exception()).__class__
                # No entries and no recognized feed version: diagnose why.
                if http_status != 304 and not r.entries and not r.get('version', ''):
                    if http_status not in [200, 302]:
                        print >> warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)
                    elif contains(http_headers.get('content-type', 'rss'), 'html'):
                        print >> warn, "W: looks like HTML [%d] %s" % (feednum, f.url)
                    elif http_headers.get('content-length', '1') == '0':
                        print >> warn, "W: empty page [%d] %s" % (feednum, f.url)
                    elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
                        print >> warn, "W: timed out on [%d] %s" % (feednum, f.url)
                    elif exc_type == IOError:
                        print >> warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
                    elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
                        print >> warn, "W: broken compression [%d] %s" % (feednum, f.url)
                    elif exc_type in socket_errors:
                        exc_reason = r.bozo_exception.args[1]
                        print >> warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    elif exc_type == urllib2.URLError:
                        if r.bozo_exception.reason.__class__ in socket_errors:
                            exc_reason = r.bozo_exception.reason.args[1]
                        else:
                            exc_reason = r.bozo_exception.reason
                        print >> warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    elif exc_type == AttributeError:
                        print >> warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
                    elif exc_type == KeyboardInterrupt:
                        raise r.bozo_exception
                    elif r.bozo:
                        print >> warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))
                    else:
                        print >> warn, "=== rss2email encountered a problem with this feed ==="
                        print >> warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                        print >> warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                        print >> warn, "E:", r.get("bozo_exception", "can't process"), f.url
                        print >> warn, r
                        print >> warn, "rss2email", __version__
                        print >> warn, "feedparser", feedparser.__version__
                        print >> warn, "html2text", h2t.__version__
                        print >> warn, "Python", sys.version
                        print >> warn, "=== END HERE ==="
                    continue

                # Oldest first, so mails arrive in chronological order.
                r.entries.reverse()

                for entry in r.entries:
                    id = getID(entry)

                    # If TRUST_GUID isn't set, we get back hashes of the content.
                    # Instead of letting these run wild, we put them in context
                    # by associating them with the actual ID (if it exists).
                    frameid = entry.get('id')
                    if not (frameid):
                        frameid = id
                    if type(frameid) is DictType:
                        frameid = frameid.values()[0]

                    # If this item's ID is in our database
                    # then it's already been sent
                    # and we don't need to do anything more.
                    if frameid in f.seen:
                        if f.seen[frameid] == id:
                            continue

                    if not (f.to or default_to):
                        print "No default email address defined. Please run 'r2e email emailaddress'"
                        print "Ignoring feed %s" % f.url
                        break

                    if 'title_detail' in entry and entry.title_detail:
                        title = entry.title_detail.value
                        if contains(entry.title_detail.type, 'html'):
                            title = html2text(title)
                    else:
                        # No title: fall back to the first 70 chars of content.
                        title = getContent(entry)[:70]

                    title = title.replace("\n", " ").strip()

                    # Pick the entry date per configured preference; default now.
                    datetime = time.gmtime()
                    if DATE_HEADER:
                        for datetype in DATE_HEADER_ORDER:
                            kind = datetype + "_parsed"
                            if kind in entry and entry[kind]:
                                datetime = entry[kind]

                    link = entry.get('link', "")

                    from_addr = getEmail(r, entry)
                    name = h2t.unescape(getName(r, entry))
                    fromhdr = formataddr((name, from_addr,))
                    tohdr = (f.to or default_to)
                    subjecthdr = title
                    datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
                    useragenthdr = "rss2email"

                    # Add post tags, if available
                    tagline = ""
                    if 'tags' in entry:
                        tags = entry.get('tags')
                        taglist = []
                        if tags:
                            for tag in tags:
                                taglist.append(tag['term'])
                        if taglist:
                            tagline = ",".join(taglist)

                    extraheaders = {
                        'Date': datehdr,
                        'User-Agent': useragenthdr,
                        'X-RSS-Feed': f.url,
                        'X-RSS-ID': id,
                        'X-RSS-URL': link,
                        'X-RSS-TAGS': tagline
                    }
                    # User-supplied extra headers, one "Name: value" per line.
                    if BONUS_HEADER != '':
                        for hdr in BONUS_HEADER.strip().splitlines():
                            pos = hdr.strip().find(':')
                            if pos > 0:
                                extraheaders[hdr[:pos]] = hdr[pos + 1:].strip()
                            else:
                                print >> warn, "W: malformed BONUS HEADER", BONUS_HEADER

                    entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
                    contenttype = 'plain'
                    content = ''

                    if USE_CSS_STYLING and HTML_MAIL:
                        # Styled HTML mail.
                        contenttype = 'html'
                        content = "<html>\n"
                        content += '<head><style><!--' + STYLE_SHEET + '//--></style></head>\n'
                        content += '<body>\n'
                        content += '<div id="entry">\n'
                        content += '<h1'
                        content += ' class="header"'
                        content += '><a href="' + link + '">' + subjecthdr + '</a></h1>\n'
                        if ishtml(entrycontent):
                            body = entrycontent[1].strip()
                        else:
                            body = entrycontent.strip()
                        if body != '':
                            content += '<div id="body"><table><tr><td>\n' + body + '</td></tr></table></div>\n'
                        content += '\n<p class="footer">URL: <a href="' + link + '">' + link + '</a>'
                        if hasattr(entry, 'enclosures'):
                            for enclosure in entry.enclosures:
                                if (hasattr(enclosure, 'url') and enclosure.url != ""):
                                    content += ('<br/>Enclosure: <a href="' + enclosure.url + '">' + enclosure.url + "</a>\n")
                                if (hasattr(enclosure, 'src') and enclosure.src != ""):
                                    content += ('<br/>Enclosure: <a href="' + enclosure.src + '">' + enclosure.src + '</a><br/><img src="' + enclosure.src + '"\n')
                        if 'links' in entry:
                            for extralink in entry.links:
                                if ('rel' in extralink) and extralink['rel'] == u'via':
                                    extraurl = extralink['href']
                                    extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
                                    viatitle = extraurl
                                    if ('title' in extralink):
                                        viatitle = extralink['title']
                                    content += '<br/>Via: <a href="' + extraurl + '">' + viatitle + '</a>\n'
                        content += '</p></div>\n'
                        content += "\n\n</body></html>"
                    else:
                        if ishtml(entrycontent):
                            # Plain (unstyled) HTML mail.
                            contenttype = 'html'
                            content = "<html>\n"
                            content = ("<html><body>\n\n" +
                                       '<h1><a href="' + link + '">' + subjecthdr + '</a></h1>\n\n' +
                                       entrycontent[1].strip() +  # drop type tag (HACK: bad abstraction)
                                       '<p>URL: <a href="' + link + '">' + link + '</a></p>')
                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('Enclosure: <a href="' + enclosure.url + '">' + enclosure.url + "</a><br/>\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += 'Via: <a href="' + extralink['href'] + '">' + extralink['title'] + '</a><br/>\n'
                            content += ("\n</body></html>")
                        else:
                            # Plain-text mail.
                            content = entrycontent.strip() + "\n\nURL: " + link
                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('\nEnclosure: ' + enclosure.url + "\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += '<a href="' + extralink['href'] + '">Via: ' + extralink['title'] + '</a>\n'

                    # Send and only then mark the entry as seen, so a send
                    # failure retries the entry on the next run.
                    smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver)

                    f.seen[frameid] = id

                f.etag, f.modified = r.get('etag', None), r.get('modified', None)
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                print >> warn, "=== rss2email encountered a problem with this feed ==="
                print >> warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                print >> warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                print >> warn, "E: could not parse", f.url
                traceback.print_exc(file=warn)
                print >> warn, "rss2email", __version__
                print >> warn, "feedparser", feedparser.__version__
                print >> warn, "html2text", h2t.__version__
                print >> warn, "Python", sys.version
                print >> warn, "=== END HERE ==="
                continue
    finally:
        # Always persist feed state and close the SMTP connection.
        unlock(feeds, feedfileObject)
        if smtpserver:
            smtpserver.quit()
def run(num=None):
    """Fetch every configured feed and mail each unseen entry.

    If *num* is given, only the feed at that index in the feeds list is
    processed.  Entries already recorded in ``f.seen`` are skipped; new
    entries are rendered (HTML or plain text, per HTML_MAIL/USE_CSS_STYLING)
    and handed to send(), which returns the (re)usable mail server handle.
    NOTE(review): Python 2 code (`print >>warn`); relies on many module-level
    globals (VERBOSE, FEED_TIMEOUT, HTML_MAIL, ...) not visible here.
    """
    feeds, feedfileObject = load()
    mailserver = None
    try:
        # We store the default to address as the first item in the feeds list.
        # Here we take it out and save it for later.
        default_to = ""
        if feeds and isstr(feeds[0]):
            default_to = feeds[0]; ifeeds = feeds[1:]
        else:
            ifeeds = feeds
        if num: ifeeds = [feeds[num]]
        feednum = 0
        for f in ifeeds:
            try:
                feednum += 1
                if not f.active: continue
                if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
                r = {}
                try:
                    # Fetch + parse under a hard timeout so one dead feed
                    # cannot stall the whole run.
                    r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
                except TimeoutError:
                    print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
                    continue
                # Handle various status conditions, as required
                if 'status' in r:
                    if r.status == 301: f.url = r['url']
                    elif r.status == 410:
                        print >>warn, "W: feed gone; deleting", f.url
                        feeds.remove(f)
                        continue
                http_status = r.get('status', 200)
                if VERBOSE > 1: print >>warn, "I: http status", http_status
                http_headers = r.get('headers', {
                    'content-type': 'application/rss+xml',
                    'content-length': '1'})
                exc_type = r.get("bozo_exception", Exception()).__class__
                # Diagnose the most common failure modes before giving up on
                # this feed for the current run.
                if http_status != 304 and not r.entries and not r.get('version', ''):
                    if http_status not in [200, 302]:
                        print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)
                    elif contains(http_headers.get('content-type', 'rss'), 'html'):
                        print >>warn, "W: looks like HTML [%d] %s" % (feednum, f.url)
                    elif http_headers.get('content-length', '1') == '0':
                        print >>warn, "W: empty page [%d] %s" % (feednum, f.url)
                    elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
                        print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
                    elif exc_type == IOError:
                        print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
                    elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
                        print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
                    elif exc_type in socket_errors:
                        exc_reason = r.bozo_exception.args[1]
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    elif exc_type == urllib2.URLError:
                        if r.bozo_exception.reason.__class__ in socket_errors:
                            exc_reason = r.bozo_exception.reason.args[1]
                        else:
                            exc_reason = r.bozo_exception.reason
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    elif exc_type == AttributeError:
                        print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
                    elif exc_type == KeyboardInterrupt:
                        raise r.bozo_exception
                    elif r.bozo:
                        print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))
                    else:
                        print >>warn, "=== rss2email encountered a problem with this feed ==="
                        print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                        print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                        print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
                        print >>warn, r
                        print >>warn, "rss2email", __version__
                        print >>warn, "feedparser", feedparser.__version__
                        print >>warn, "html2text", h2t.__version__
                        print >>warn, "Python", sys.version
                        print >>warn, "=== END HERE ==="
                    continue
                # Oldest entries first, so mails arrive in posting order.
                r.entries.reverse()
                for entry in r.entries:
                    id = getID(entry)
                    # If TRUST_GUID isn't set, we get back hashes of the content.
                    # Instead of letting these run wild, we put them in context
                    # by associating them with the actual ID (if it exists).
                    frameid = entry.get('id')
                    if not(frameid): frameid = id
                    if type(frameid) is DictType:
                        frameid = frameid.values()[0]
                    # If this item's ID is in our database
                    # then it's already been sent
                    # and we don't need to do anything more.
                    if frameid in f.seen:
                        if f.seen[frameid] == id: continue
                    if not (f.to or default_to):
                        print "No default email address defined. Please run 'r2e email emailaddress'"
                        print "Ignoring feed %s" % f.url
                        break
                    # Build the subject from the entry title (de-HTML-ized if
                    # necessary), falling back to a content excerpt.
                    if 'title_detail' in entry and entry.title_detail:
                        title = entry.title_detail.value
                        if contains(entry.title_detail.type, 'html'):
                            title = html2text(title)
                    else:
                        title = getContent(entry)[:70]
                    title = title.replace("\n", " ").strip()
                    datetime = time.gmtime()
                    if DATE_HEADER:
                        for datetype in DATE_HEADER_ORDER:
                            kind = datetype + "_parsed"
                            if kind in entry and entry[kind]: datetime = entry[kind]
                    link = entry.get('link', "")
                    from_addr = getEmail(r, entry)
                    name = h2t.unescape(getName(r, entry))
                    fromhdr = formataddr((name, from_addr,))
                    tohdr = (f.to or default_to)
                    subjecthdr = title
                    datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
                    useragenthdr = "rss2email"
                    # Add post tags, if available
                    tagline = ""
                    if 'tags' in entry:
                        tags = entry.get('tags')
                        taglist = []
                        if tags:
                            for tag in tags:
                                taglist.append(tag['term'])
                        if taglist:
                            tagline = ",".join(taglist)
                    extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'Message-ID': '<%s>' % hashlib.sha1(id.encode('utf-8')).hexdigest(), 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline, 'X-MUNGED-FROM': getMungedFrom(r), 'References': ''}
                    if BONUS_HEADER != '':
                        for hdr in BONUS_HEADER.strip().splitlines():
                            pos = hdr.strip().find(':')
                            if pos > 0:
                                extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
                            else:
                                print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER
                    entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
                    contenttype = 'plain'
                    content = ''
                    # Optionally thread messages that share tags by seeding
                    # the References header with per-tag hashes.
                    if THREAD_ON_TAGS and len(tagline):
                        extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(t.strip().encode('utf-8')).hexdigest() for t in tagline.split(',')])
                    if USE_CSS_STYLING and HTML_MAIL:
                        # Styled-HTML rendering path.
                        contenttype = 'html'
                        content = "<html>\n"
                        content += '<head><meta http-equiv="Content-Type" content="text/html"><style>' + STYLE_SHEET + '</style></head>\n'
                        content += '<body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">\n'
                        content += '<div id="entry">\n'
                        content += '<h1 class="header"'
                        content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
                        if ishtml(entrycontent):
                            body = entrycontent[1].strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (summarize(html2text(body, plaintext=True), SUMMARIZE) + "<hr>")
                        else:
                            body = entrycontent.strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (summarize(body, SUMMARIZE) + "<hr>")
                        if THREAD_ON_LINKS:
                            parser = Parser()
                            parser.feed(body)
                            extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(h.strip().encode('utf-8')).hexdigest() for h in parser.attrs])
                        if INLINE_IMAGES_DATA_URI:
                            # Replace each <img src> with an inline base64 data:
                            # URI; failures are logged and the image left as-is.
                            parser = Parser(tag='img', attr='src')
                            parser.feed(body)
                            for src in parser.attrs:
                                try:
                                    img = feedparser._open_resource(src, None, None, feedparser.USER_AGENT, link, [], {})
                                    data = img.read()
                                    if hasattr(img, 'headers'):
                                        headers = dict((k.lower(), v) for k, v in dict(img.headers).items())
                                        ctype = headers.get('content-type', None)
                                        if ctype and INLINE_IMAGES_DATA_URI:
                                            body = body.replace(src, 'data:%s;base64,%s' % (ctype, base64.b64encode(data)))
                                except:
                                    print >>warn, "Could not load image: %s" % src
                                    pass
                        if body != '':
                            content += '<div id="body">\n' + body + '</div>\n'
                        content += '\n<p class="footer">URL: <a href="'+link+'">'+link+'</a>'
                        if hasattr(entry, 'enclosures'):
                            for enclosure in entry.enclosures:
                                if (hasattr(enclosure, 'url') and enclosure.url != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
                                if (hasattr(enclosure, 'src') and enclosure.src != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'"\n')
                        if 'links' in entry:
                            for extralink in entry.links:
                                if ('rel' in extralink) and extralink['rel'] == u'via':
                                    extraurl = extralink['href']
                                    extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
                                    viatitle = extraurl
                                    if ('title' in extralink):
                                        viatitle = extralink['title']
                                    content += '<br/>Via: <a href="'+extraurl+'">'+viatitle+'</a>\n'
                        content += '</p></div>\n'
                        content += "\n\n</body></html>"
                    else:
                        if ishtml(entrycontent):
                            # Plain (un-styled) HTML rendering path.
                            contenttype = 'html'
                            content = "<html>\n"
                            content = ("<html><body>\n\n" +
                                       '<h1><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n' +
                                       entrycontent[1].strip() +  # drop type tag (HACK: bad abstraction)
                                       '<p>URL: <a href="'+link+'">'+link+'</a></p>')
                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'
                            content += ("\n</body></html>")
                        else:
                            # Plain-text rendering path.
                            content = entrycontent.strip() + "\n\nURL: " + link
                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('\nEnclosure: ' + enclosure.url + "\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += '<a href="'+extralink['href']+'">Via: '+extralink['title']+'</a>\n'
                    # send() returns the live server handle so the connection
                    # is reused across entries/feeds.
                    mailserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, datetime, extraheaders, mailserver, f.folder)
                    f.seen[frameid] = id
                f.etag, f.modified = r.get('etag', None), r.get('modified', None)
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                # Deliberate catch-all: one broken feed must not abort the
                # rest of the run; dump diagnostics and move on.
                print >>warn, "=== rss2email encountered a problem with this feed ==="
                print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                print >>warn, "E: could not parse", f.url
                traceback.print_exc(file=warn)
                print >>warn, "rss2email", __version__
                print >>warn, "feedparser", feedparser.__version__
                print >>warn, "html2text", h2t.__version__
                print >>warn, "Python", sys.version
                print >>warn, "=== END HERE ==="
                continue
    finally:
        unlock(feeds, feedfileObject)
        if mailserver:
            # IMAP housekeeping: optionally mark delivered mail as read
            # and/or move read mail to a dedicated folder.
            if IMAP_MARK_AS_READ:
                for folder in IMAP_MARK_AS_READ:
                    mailserver.select(folder)
                    res, data = mailserver.search(None, '(UNSEEN UNFLAGGED)')
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('STORE', u, '+FLAGS', '(\Seen)')
            if IMAP_MOVE_READ_TO:
                typ, data = mailserver.list(pattern='*')
                # Parse folder listing as a CSV dialect (automatically removes quotes)
                reader = csv.reader(StringIO.StringIO('\n'.join(data)), dialect='mailboxlist')
                # Iterate over each folder
                for row in reader:
                    folder = row[-1:][0]
                    if folder == IMAP_MOVE_READ_TO or '\Noselect' in row[0]:
                        continue
                    mailserver.select(folder)
                    res, data = mailserver.search(None, '(SEEN UNFLAGGED)')
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('COPY', u, IMAP_MOVE_READ_TO)
                                if res == 'OK':
                                    res, data = mailserver.uid('STORE', u, '+FLAGS', '(\Deleted)')
                                    mailserver.expunge()
            try:
                mailserver.quit()
            except:
                mailserver.logout()
def do_login(self):
    """
    Attempt to log in.
    Note: this method does nothing if we are already logged in.

    Flow: navigate to the home page, post the account number (old website
    only), post the password, then follow the resulting URL to the accounts
    synthesis page.  From that page URL, the URLs for savings, loans,
    advisor, profile and perimeters are derived by rewriting the ``act=``
    query parameter.

    Raises:
        SiteSwitch: the connection must use the new API website instead.
        WebsiteNotSupported: the auth form cannot be found / page is unusable.
        BrowserIncorrectPassword: login was rejected.
    """
    self.BASEURL = 'https://%s/' % self.first_domain
    self._sag = None
    if not self.home_page.is_here():
        self.home_page.go()
    if self.new_website.is_here():
        self.logger.warning('This connection uses the new API website')
        raise SiteSwitch('api')
    if self.new_login:
        self.page.go_to_auth()
        parsed = urlparse(self.url)
        self.BASEURL = '%s://%s' % (parsed.scheme, parsed.netloc)
    else:
        # On the homepage, we get the URL of the auth service.
        url = self.page.get_post_url()
        if url is None:
            raise WebsiteNotSupported()
        # First, post account number to get the password prompt.
        # NOTE(review): only the first 11 characters of the username are
        # sent, iso8859-15-encoded — presumably the site's fixed-width
        # account-number field; confirm against the live form.
        data = {'CCPTE': self.username[:11].encode('iso8859-15'),
                'canal': 'WEB',
                'hauteur_ecran': 768,
                'largeur_ecran': 1024,
                'liberror': '',
                'matrice': 'true',
                'origine': 'vitrine',
                'situationTravail': 'BANCAIRE',
                'typeAuthentification': 'CLIC_ALLER',
                'urlOrigine': self.page.url,
                'vitrine': 0,
               }
        parsed = urlparse(url)
        self.BASEURL = '%s://%s' % (parsed.scheme, parsed.netloc)
        self.location(url, data=data)
        assert self.login_page.is_here()
    # Then, post the password.
    self.page.login(self.username, self.password)
    if self.new_login:
        url = self.page.get_accounts_url()
    else:
        # The result of POST is the destination URL.
        url = self.page.get_result_url()
        if not url.startswith('http'):
            # A non-URL result carries the (HTML-escaped) error message.
            raise BrowserIncorrectPassword(unescape(url, unicode_snob=True))
    self.location(url.replace('Synthese', 'Synthcomptes'))
    if self.login_error.is_here():
        raise BrowserIncorrectPassword()
    if self.page is None:
        raise WebsiteNotSupported()
    if not self.accounts.is_here():
        # Sometimes the home page is Releves.
        new_url = re.sub('act=([^&=]+)', 'act=Synthcomptes', self.page.url, 1)
        self.location(new_url)
        if not self.accounts.is_here():
            raise BrowserIncorrectPassword()
    if self.code_caisse is None:
        self.code_caisse = self.page.get_code_caisse()
    # Store the current url to go back when requesting accounts list.
    # The session id is templated out so it can be re-filled per request.
    self.accounts_url = re.sub('sessionSAG=[^&]+', 'sessionSAG={0}', self.page.url)
    # we can deduce the URL to "savings" and "loan" accounts from the regular accounts one
    self.savings_url = re.sub('act=([^&=]+)', 'act=Synthepargnes', self.accounts_url, 1)
    self.loans_url = re.sub('act=([^&=]+)', 'act=Synthcredits', self.accounts_url, 1)
    self.advisor_url = re.sub('act=([^&=]+)', 'act=Contact', self.accounts_url, 1)
    self.profile_url = re.sub('act=([^&=]+)', 'act=Coordonnees', self.accounts_url, 1)
    if self.page.check_perimeters() and not self.broken_perimeters:
        self.perimeter_url = re.sub('act=([^&=]+)', 'act=Perimetre', self.accounts_url, 1)
        self.chg_perimeter_url = '%s%s' % (re.sub('act=([^&=]+)', 'act=ChgPerim', self.accounts_url, 1), '&typeaction=ChgPerim')
        self.location(self.perimeter_url.format(self.sag))
        self.page.check_multiple_perimeters()
def get_thread(self, id, contacts=None, get_profiles=False):
    """
    Get a thread and its messages.

    The 'contacts' parameters is only used for internal calls.

    :param id: a thread id (int/str) or an existing Thread object; when a
        Thread is given, it is refreshed in place ("full" mode).
    :param get_profiles: when True, also fetch the contact profile of each
        sender (used for signatures and anti-spam checks).
    :rtype: Thread — messages are linked as a parent/child chain, newest
        message becoming ``thread.root``.
    """
    thread = None
    if isinstance(id, Thread):
        thread = id
        id = thread.id
    if not thread:
        thread = Thread(int(id))
        thread.flags = Thread.IS_DISCUSSION
        full = False
    else:
        full = True
    with self.browser:
        mails = self.browser.get_thread_mails(id, 100)
        my_name = self.browser.get_my_name()
    child = None
    msg = None
    # NOTE(review): identifier was profanity-redacted in the mangled source
    # ("s**t"); restored from the visible self._get_slut() call.
    slut = self._get_slut(id)
    if contacts is None:
        contacts = {}
    if not thread.title:
        thread.title = u'Discussion with %s' % mails['who']['pseudo']
    # Storage domain name restored from redaction ("s***s") likewise.
    self.storage.set('sluts', int(thread.id), 'status', mails['status'])
    self.storage.save()
    for mail in mails['results']:
        flags = 0
        if self.antispam and not self.antispam.check_mail(mail):
            self.logger.info('Skipped a spam-mail from %s' % mails['who']['pseudo'])
            self.report_spam(thread.id)
            break
        # Anything newer than the last-seen timestamp is unread.
        if parse_dt(mail['date']) > slut['lastmsg']:
            flags |= Message.IS_UNREAD
        if get_profiles:
            if not mail['from'] in contacts:
                try:
                    with self.browser:
                        contacts[mail['from']] = self.get_contact(mail['from'])
                except BrowserHTTPNotFound:
                    # Profile gone; proceed without it.
                    pass
            if self.antispam and mail['from'] in contacts and not self.antispam.check_contact(contacts[mail['from']]):
                self.logger.info('Skipped a spam-mail-profile from %s' % mails['who']['pseudo'])
                self.report_spam(thread.id)
                break
        if int(mail['from']) == self.browser.my_id:
            # Our own latest message is "not received" until the remote
            # status says otherwise.
            if mails['remote_status'] == 'new' and msg is None:
                flags |= Message.IS_NOT_RECEIVED
            else:
                flags |= Message.IS_RECEIVED
        signature = u''
        #if mail.get('src', None):
        #    signature += u'Sent from my %s\n\n' % mail['src']
        if mail['from'] in contacts:
            signature += contacts[mail['from']].get_text()
        msg = Message(thread=thread,
                      id=int(time.strftime('%Y%m%d%H%M%S', parse_dt(mail['date']).timetuple())),
                      title=thread.title,
                      sender=to_unicode(my_name if int(mail['from']) == self.browser.my_id else mails['who']['pseudo']),
                      receivers=[to_unicode(my_name if int(mail['from']) != self.browser.my_id else mails['who']['pseudo'])],
                      date=parse_dt(mail['date']),
                      content=to_unicode(unescape(mail['message'] or '').strip()),
                      signature=signature,
                      children=[],
                      flags=flags)
        if child:
            msg.children.append(child)
            child.parent = msg
        child = msg
    if full and msg:
        # If we have get all the messages, replace NotLoaded with None as
        # parent.
        msg.parent = None
    if not full and not msg:
        # Perhaps there are hidden messages
        msg = NotLoaded
    thread.root = msg
    return thread
rsuffix='_other', lsuffix='_original', ) # %% # Assign to shopify_data shopify_data['Handle'] = jointed['slug_other'] shopify_data['Title'] = wc_data['Name'] shopify_data['Variant SKU'] = wc_data['SKU'] shopify_data['Body (HTML)'] = wc_data['Description'] shopify_data['Vendor'] = wc_data['Manufacturer'] wc_data['new_tags'] = wc_data[['Tags', 'Categories']].fillna(value='').apply( lambda x: re.split('>|,', (x['Categories'] + x['Tags']).strip()), axis=1) shopify_data['Tags'] = wc_data['new_tags'].apply( lambda x: unescape(','.join(np.unique([y.strip() for y in x])))) published_dict = defaultdict(lambda: 'FALSE') published_dict[1] = 'TRUE' shopify_data['Published'] = wc_data['Published'].map(published_dict) shopify_data['WC Type'] = wc_data['Type'] is_variation = shopify_data['WC Type'] == 'variation' is_simple = shopify_data['WC Type'] == 'simple' is_variable = shopify_data['WC Type'] == 'variable' is_not_variation = shopify_data['WC Type'] != 'variation' wc_data['packaging'].fillna(value='') shopify_data['Option1 Name'] = 'Packaging' shopify_data['Option1 Value'] = wc_data['packaging']
def get_thread(self, id, contacts=None, get_profiles=False):
    """
    Get a thread and its messages.

    The 'contacts' parameters is only used for internal calls.

    :param id: a thread id or an existing Thread object (refresh mode).
    :param get_profiles: when True, fetch each sender's contact profile so
        its text can be appended to the message signature.
    :rtype: Thread — messages are chained via parent/child links; the
        newest message becomes ``thread.root``.
    """
    thread = None
    if isinstance(id, Thread):
        thread = id
        id = thread.id
    if not thread:
        thread = Thread(int(id))
        thread.flags = Thread.IS_DISCUSSION
        full = False
    else:
        full = True
    with self.browser:
        mails = self.browser.get_thread_mails(id, 100)
        my_name = self.browser.get_my_name()
    child = None
    msg = None
    # NOTE(review): identifier was profanity-redacted in the mangled source
    # ("s**t"); restored from the visible self._get_slut() call.
    slut = self._get_slut(mails["member"]["pseudo"])
    if contacts is None:
        contacts = {}
    if not thread.title:
        thread.title = u"Discussion with %s" % mails["member"]["pseudo"]
    for mail in mails["messages"]:
        flags = Message.IS_HTML
        # Unread = newer than last-seen timestamp and not sent by us.
        if parse_dt(mail["date"]) > slut["lastmsg"] and mail["id_from"] != self.browser.get_my_name():
            flags |= Message.IS_UNREAD
        if get_profiles:
            if not mail["id_from"] in contacts:
                with self.browser:
                    contacts[mail["id_from"]] = self.get_contact(mail["id_from"])
        signature = u""
        if mail.get("src", None):
            signature += u"Sent from my %s\n\n" % mail["src"]
        if mail["id_from"] in contacts:
            signature += contacts[mail["id_from"]].get_text()
        msg = Message(
            thread=thread,
            # Message id derived from the mail timestamp (YYYYMMDDHHMMSS).
            id=int(time.strftime("%Y%m%d%H%M%S", parse_dt(mail["date"]).timetuple())),
            title=thread.title,
            sender=mail["id_from"],
            receivers=[my_name if mail["id_from"] != my_name else mails["member"]["pseudo"]],
            date=parse_dt(mail["date"]),
            content=unescape(mail["message"]).strip(),
            signature=signature,
            children=[],
            flags=flags,
        )
        if child:
            msg.children.append(child)
            child.parent = msg
        child = msg
    if full and msg:
        # If we have get all the messages, replace NotLoaded with None as
        # parent.
        msg.parent = None
    if not full and not msg:
        # Perhaps there are hidden messages
        msg = NotLoaded
    thread.root = msg
    return thread
def get_thread(self, id, contacts=None, get_profiles=False):
    """
    Get a thread and its messages.

    The 'contacts' parameters is only used for internal calls.

    :param id: a thread id (int/str) or an existing Thread object; when a
        Thread is given it is refreshed in place ("full" mode).
    :param get_profiles: when True, also fetch each sender's contact
        profile (used for signatures and anti-spam checks).
    :rtype: Thread — messages are linked as a parent/child chain, newest
        message becoming ``thread.root``.
    """
    thread = None
    if isinstance(id, Thread):
        thread = id
        id = thread.id
    if not thread:
        thread = Thread(int(id))
        thread.flags = Thread.IS_DISCUSSION
        full = False
    else:
        full = True
    with self.browser:
        mails = self.browser.get_thread_mails(id, 100)
        my_name = self.browser.get_my_name()
    child = None
    msg = None
    # NOTE(review): identifier was profanity-redacted in the mangled source
    # ("s**t"); restored from the visible self._get_slut() call.
    slut = self._get_slut(id)
    if contacts is None:
        contacts = {}
    if not thread.title:
        thread.title = u'Discussion with %s' % mails['who']['pseudo']
    # Storage domain name restored from redaction ("s***s") likewise.
    self.storage.set('sluts', int(thread.id), 'status', mails['status'])
    self.storage.save()
    for mail in mails['results']:
        flags = 0
        if self.antispam and not self.antispam.check_mail(mail):
            self.logger.info('Skipped a spam-mail from %s' % mails['who']['pseudo'])
            self.report_spam(thread.id)
            break
        # Anything newer than the last-seen timestamp is unread.
        if parse_dt(mail['date']) > slut['lastmsg']:
            flags |= Message.IS_UNREAD
        if get_profiles:
            if not mail['from'] in contacts:
                try:
                    with self.browser:
                        contacts[mail['from']] = self.get_contact(
                            mail['from'])
                except BrowserHTTPNotFound:
                    # Profile gone; proceed without it.
                    pass
            if self.antispam and mail[
                    'from'] in contacts and not self.antispam.check_contact(
                        contacts[mail['from']]):
                self.logger.info(
                    'Skipped a spam-mail-profile from %s' % mails['who']['pseudo'])
                self.report_spam(thread.id)
                break
        if int(mail['from']) == self.browser.my_id:
            # Our own latest message is "not received" until the remote
            # status says otherwise.
            if mails['remote_status'] == 'new' and msg is None:
                flags |= Message.IS_NOT_RECEIVED
            else:
                flags |= Message.IS_RECEIVED
        signature = u''
        #if mail.get('src', None):
        #    signature += u'Sent from my %s\n\n' % mail['src']
        if mail['from'] in contacts:
            signature += contacts[mail['from']].get_text()
        msg = Message(
            thread=thread,
            id=int(
                time.strftime('%Y%m%d%H%M%S',
                              parse_dt(mail['date']).timetuple())),
            title=thread.title,
            sender=to_unicode(my_name if int(mail['from']) == self.browser.
                              my_id else mails['who']['pseudo']),
            receivers=[
                to_unicode(my_name if int(mail['from']) != self.browser.
                           my_id else mails['who']['pseudo'])
            ],
            date=parse_dt(mail['date']),
            content=to_unicode(unescape(mail['message'] or '').strip()),
            signature=signature,
            children=[],
            flags=flags)
        if child:
            msg.children.append(child)
            child.parent = msg
        child = msg
    if full and msg:
        # If we have get all the messages, replace NotLoaded with None as
        # parent.
        msg.parent = None
    if not full and not msg:
        # Perhaps there are hidden messages
        msg = NotLoaded
    thread.root = msg
    return thread
def run(num=None):
    """Fetch every configured feed and mail each unseen entry via SMTP.

    If *num* is given, only the feed at that index in the feeds list is
    processed.  Entries already recorded in ``f.seen`` are skipped; new
    entries are rendered (HTML or plain text, per HTML_MAIL/USE_CSS_STYLING)
    and handed to send(), which returns the reusable SMTP server handle.
    NOTE(review): Python 2 code (`print >>warn`); relies on many module-level
    globals (VERBOSE, FEED_TIMEOUT, HTML_MAIL, ...) not visible here.
    """
    feeds, feedfileObject = load()
    smtpserver = None
    try:
        # We store the default to address as the first item in the feeds list.
        # Here we take it out and save it for later.
        default_to = ""
        if feeds and isstr(feeds[0]):
            default_to = feeds[0]; ifeeds = feeds[1:]
        else:
            ifeeds = feeds
        if num: ifeeds = [feeds[num]]
        feednum = 0
        for f in ifeeds:
            try:
                feednum += 1
                if not f.active: continue
                if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
                r = {}
                try:
                    # Fetch + parse under a hard timeout so one dead feed
                    # cannot stall the whole run.
                    r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
                except TimeoutError:
                    print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
                    continue
                # Handle various status conditions, as required
                if 'status' in r:
                    if r.status == 301: f.url = r['url']
                    elif r.status == 410:
                        print >>warn, "W: feed gone; deleting", f.url
                        feeds.remove(f)
                        continue
                http_status = r.get('status', 200)
                if VERBOSE > 1: print >>warn, "I: http status", http_status
                http_headers = r.get('headers', {
                    'content-type': 'application/rss+xml',
                    'content-length': '1'})
                exc_type = r.get("bozo_exception", Exception()).__class__
                # Diagnose the most common failure modes before giving up on
                # this feed for the current run.
                if http_status != 304 and not r.entries and not r.get('version', ''):
                    if http_status not in [200, 302]:
                        print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)
                    elif contains(http_headers.get('content-type', 'rss'), 'html'):
                        print >>warn, "W: looks like HTML [%d] %s" % (feednum, f.url)
                    elif http_headers.get('content-length', '1') == '0':
                        print >>warn, "W: empty page [%d] %s" % (feednum, f.url)
                    elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
                        print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
                    elif exc_type == IOError:
                        print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
                    elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
                        print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
                    elif exc_type in socket_errors:
                        exc_reason = r.bozo_exception.args[1]
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    elif exc_type == urllib2.URLError:
                        if r.bozo_exception.reason.__class__ in socket_errors:
                            exc_reason = r.bozo_exception.reason.args[1]
                        else:
                            exc_reason = r.bozo_exception.reason
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    elif exc_type == AttributeError:
                        print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
                    elif exc_type == KeyboardInterrupt:
                        raise r.bozo_exception
                    elif r.bozo:
                        print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))
                    else:
                        print >>warn, "=== rss2email encountered a problem with this feed ==="
                        print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                        print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                        print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
                        print >>warn, r
                        print >>warn, "rss2email", __version__
                        print >>warn, "feedparser", feedparser.__version__
                        print >>warn, "html2text", h2t.__version__
                        print >>warn, "Python", sys.version
                        print >>warn, "=== END HERE ==="
                    continue
                # Oldest entries first, so mails arrive in posting order.
                r.entries.reverse()
                for entry in r.entries:
                    id = getID(entry)
                    # If TRUST_GUID isn't set, we get back hashes of the content.
                    # Instead of letting these run wild, we put them in context
                    # by associating them with the actual ID (if it exists).
                    frameid = entry.get('id')
                    if not(frameid): frameid = id
                    if type(frameid) is DictType:
                        frameid = frameid.values()[0]
                    # If this item's ID is in our database
                    # then it's already been sent
                    # and we don't need to do anything more.
                    if frameid in f.seen:
                        if f.seen[frameid] == id: continue
                    if not (f.to or default_to):
                        print "No default email address defined. Please run 'r2e email emailaddress'"
                        print "Ignoring feed %s" % f.url
                        break
                    # Build the subject from the entry title (de-HTML-ized if
                    # necessary), falling back to a content excerpt.
                    if 'title_detail' in entry and entry.title_detail:
                        title = entry.title_detail.value
                        if contains(entry.title_detail.type, 'html'):
                            title = html2text(title)
                    else:
                        title = getContent(entry)[:70]
                    title = title.replace("\n", " ").strip()
                    datetime = time.gmtime()
                    if DATE_HEADER:
                        for datetype in DATE_HEADER_ORDER:
                            kind = datetype + "_parsed"
                            if kind in entry and entry[kind]: datetime = entry[kind]
                    link = entry.get('link', "")
                    from_addr = getEmail(r, entry)
                    name = h2t.unescape(getName(r, entry))
                    fromhdr = formataddr((name, from_addr,))
                    tohdr = (f.to or default_to)
                    subjecthdr = title
                    datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime)
                    useragenthdr = "rss2email"
                    # Add post tags, if available
                    tagline = getTags(entry)
                    extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline}
                    if BONUS_HEADER != '':
                        for hdr in BONUS_HEADER.strip().splitlines():
                            pos = hdr.strip().find(':')
                            if pos > 0:
                                extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
                            else:
                                print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER
                    entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
                    contenttype = 'plain'
                    content = ''
                    if USE_CSS_STYLING and HTML_MAIL:
                        # Styled-HTML rendering path.
                        contenttype = 'html'
                        content = "<html>\n"
                        content += '<head><style><!--' + STYLE_SHEET + '//--></style></head>\n'
                        content += '<body>\n'
                        content += '<div id="entry">\n'
                        content += '<h1'
                        content += ' class="header"'
                        content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
                        if ishtml(entrycontent):
                            body = entrycontent[1].strip()
                        else:
                            body = entrycontent.strip()
                        if body != '':
                            content += '<div id="body"><table><tr><td>\n' + body + '</td></tr></table></div>\n'
                        content += '\n<p class="footer">URL: <a href="'+link+'">'+link+'</a>'
                        if hasattr(entry, 'enclosures'):
                            for enclosure in entry.enclosures:
                                if (hasattr(enclosure, 'url') and enclosure.url != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
                                if (hasattr(enclosure, 'src') and enclosure.src != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'"\n')
                        if 'links' in entry:
                            for extralink in entry.links:
                                if ('rel' in extralink) and extralink['rel'] == u'via':
                                    extraurl = extralink['href']
                                    extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
                                    viatitle = extraurl
                                    if ('title' in extralink):
                                        viatitle = extralink['title']
                                    content += '<br/>Via: <a href="'+extraurl+'">'+viatitle+'</a>\n'
                        content += '</p></div>\n'
                        content += "\n\n</body></html>"
                    else:
                        if ishtml(entrycontent):
                            # Plain (un-styled) HTML rendering path.
                            contenttype = 'html'
                            content = "<html>\n"
                            content = ("<html><body>\n\n" +
                                       '<h1><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n' +
                                       entrycontent[1].strip() +  # drop type tag (HACK: bad abstraction)
                                       '<p>URL: <a href="'+link+'">'+link+'</a></p>')
                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'
                            content += ("\n</body></html>")
                        else:
                            # Plain-text rendering path.
                            content = entrycontent.strip() + "\n\nURL: " + link
                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('\nEnclosure: ' + enclosure.url + "\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += '<a href="'+extralink['href']+'">Via: '+extralink['title']+'</a>\n'
                    # send() returns the live server handle so the connection
                    # is reused across entries/feeds.
                    smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, extraheaders, smtpserver)
                    f.seen[frameid] = id
                f.etag, f.modified = r.get('etag', None), r.get('modified', None)
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                # Deliberate catch-all: one broken feed must not abort the
                # rest of the run; dump diagnostics and move on.
                print >>warn, "=== rss2email encountered a problem with this feed ==="
                print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                print >>warn, "E: could not parse", f.url
                traceback.print_exc(file=warn)
                print >>warn, "rss2email", __version__
                print >>warn, "feedparser", feedparser.__version__
                print >>warn, "html2text", h2t.__version__
                print >>warn, "Python", sys.version
                print >>warn, "=== END HERE ==="
                continue
    finally:
        unlock(feeds, feedfileObject)
        if smtpserver: smtpserver.quit()
def run(num=None): feeds, feedfileObject = load() try: # We store the default to address as the first item in the feeds list. # Here we take it out and save it for later. default_to = "" if feeds and isstr(feeds[0]): default_to = feeds[0]; ifeeds = feeds[1:] else: ifeeds = feeds if num: ifeeds = [feeds[num]] feednum = 0 smtpserver = None for f in ifeeds: try: feednum += 1 if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, hidepass(f.url)) r = {} try: r = parse(f.url, f.etag, f.modified, FEED_TIMEOUT) except TimeoutError: print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, hidepass(f.url)) continue # Handle various status conditions, as required if 'status' in r: if r.status == 301: f.url = r['url'] elif r.status == 410: print >>warn, "W: feed gone; deleting", hidepass(f.url) feeds.remove(f) continue http_status = r.get('status', 200) http_headers = r.get('headers', { 'content-type': 'application/rss+xml', 'content-length':'1'}) exc_type = r.get("bozo_exception", Exception()).__class__ if http_status != 304 and not r.get('version', ''): if http_status not in [200, 302]: print >>warn, "W: error %d [%d] %s" % (http_status, feednum, hidepass(f.url)) elif contains(http_headers.get('content-type', 'rss'), 'html'): print >>warn, "W: looks like HTML [%d] %s" % (feednum, hidepass(f.url)) elif http_headers.get('content-length', '1') == '0': print >>warn, "W: empty page [%d] %s" % (feednum, hidepass(f.url)) elif hasattr(socket, 'timeout') and exc_type == socket.timeout: print >>warn, "W: timed out on [%d] %s" % (feednum, hidepass(f.url)) elif exc_type == IOError: print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, hidepass(f.url)) elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error: print >>warn, "W: broken compression [%d] %s" % (feednum, hidepass(f.url)) elif exc_type in socket_errors: exc_reason = r.bozo_exception.args[1] print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, hidepass(f.url)) elif exc_type == urllib2.URLError: 
if r.bozo_exception.reason.__class__ in socket_errors: exc_reason = r.bozo_exception.reason.args[1] else: exc_reason = r.bozo_exception.reason print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, hidepass(f.url)) elif exc_type == AttributeError: print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, hidepass(f.url)) elif exc_type == KeyboardInterrupt: raise r.bozo_exception elif r.bozo: print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, hidepass(f.url), r.get("bozo_exception", "can't process")) else: print >>warn, "=== SEND THE FOLLOWING TO [email protected] ===" print >>warn, "E:", r.get("bozo_exception", "can't process"), hidepass(f.url) print >>warn, r print >>warn, "rss2email", __version__ print >>warn, "feedparser", feedparser.__version__ print >>warn, "html2text", h2t.__version__ print >>warn, "Python", sys.version print >>warn, "=== END HERE ===" continue r.entries.reverse() for entry in r.entries: id = getID(entry) # If TRUST_GUID isn't set, we get back hashes of the content. # Instead of letting these run wild, we put them in context # by associating them with the actual ID (if it exists). frameid = entry.get('id', id) # If this item's ID is in our database # then it's already been sent # and we don't need to do anything more. if f.seen.has_key(frameid) and f.seen[frameid] == id: continue if not (f.to or default_to): print "No default email address defined. 
Please run 'r2e email emailaddress'" print "Ignoring feed %s" % hidepass(f.url) break if 'title_detail' in entry and entry.title_detail: title = entry.title_detail.value if contains(entry.title_detail.type, 'html'): title = html2text(title) else: title = getContent(entry)[:70] title = title.replace("\n", " ").strip() datetime = time.gmtime() if DATE_HEADER: for datetype in DATE_HEADER_ORDER: kind = datetype+"_parsed" if kind in entry and entry[kind]: datetime = entry[kind] link = entry.get('link', "") from_addr = getEmail(r.feed, entry) name = getName(r, entry) entrycontent = getContent(entry, HTMLOK=HTML_MAIL) force_html = USE_CSS_STYLING and HTML_MAIL if force_html or ishtml(entrycontent): contenttype = 'html' enctpl = TPL_ENCLOSURE_HTML if force_html: tpl = TPL_HTML_CSS else: tpl = TPL_HTML_PLAIN else: contenttype = 'plain' enctpl = TPL_ENCLOSURE_TEXT tpl = TPL_TEXT if hasattr(entry,'enclosures'): encs = [enclosure.url for enclosure in entry.enclosures if enclosure.url != ""] else: encs = () substs = { 'STYLE_SHEET': STYLE_SHEET, 'link': link, 'title': title, } if ishtml(entrycontent): substs['body'] = entrycontent[1].strip() else: substs['body'] = entrycontent.strip() if len(encs): substs['ENCLOSURE_TPL'] = ( enctpl[0] + "".join([enctpl[1] % dict(substs.items() + {'enclosure': eurl}.items()) for eurl in encs]) + enctpl[2]) else: substs['ENCLOSURE_TPL'] = "" fromhdr = '"'+ name + '" <' + from_addr + ">" tohdr = (f.to or default_to) subjecthdr = h2t.unescape(title) content = tpl % substs extraheaders = { 'Date': time.strftime("%a, %d %b %Y %H:%M:%S -0000", datetime), 'User-Agent': "rss2email/"+__version__ } if FEED_URL_HEADER: extraheaders[FEED_URL_HEADER] = hidepass(f.url) if BONUS_HEADER != '': for hdr in BONUS_HEADER.strip().splitlines(): pos = hdr.strip().find(':') if pos > 0: extraheaders[hdr[:pos]] = hdr[pos+1:].strip() else: print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER smtpserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, 
extraheaders, smtpserver) f.seen[frameid] = id f.etag, f.modified = r.get('etag', None), r.get('modified', None) except (KeyboardInterrupt, SystemExit): raise except: print >>warn, "=== SEND THE FOLLOWING TO [email protected] ===" print >>warn, "E: could not parse", hidepass(f.url) traceback.print_exc(file=warn) print >>warn, "rss2email", __version__ print >>warn, "feedparser", feedparser.__version__ print >>warn, "html2text", h2t.__version__ print >>warn, "Python", sys.version print >>warn, "=== END HERE ===" continue finally: unlock(feeds, feedfileObject) if smtpserver: smtpserver.quit()
def run(num=None):
    """Fetch every active feed (or only feed #num) and mail new entries.

    IMAP-flavoured variant of run(): send() is handed a mail-server
    connection plus f.folder, and the finally block performs IMAP
    housekeeping (IMAP_MARK_AS_READ / IMAP_MOVE_READ_TO) before closing
    the connection.  The feed database loaded by load() is always
    unlocked in the finally block.

    :param num: optional 1-based index into the feeds list; when given,
                only that single feed is processed.
    """
    feeds, feedfileObject = load()
    mailserver = None
    try:
        # We store the default to address as the first item in the feeds list.
        # Here we take it out and save it for later.
        default_to = ""
        if feeds and isstr(feeds[0]):
            default_to = feeds[0]; ifeeds = feeds[1:]
        else:
            ifeeds = feeds
        if num: ifeeds = [feeds[num]]
        feednum = 0

        for f in ifeeds:
            try:
                feednum += 1
                # Disabled feeds are counted (for stable numbering) but skipped.
                if not f.active: continue
                if VERBOSE: print >>warn, 'I: Processing [%d] "%s"' % (feednum, f.url)
                r = {}
                try:
                    # timelimit() presumably wraps parse() with a timeout —
                    # TODO confirm against its definition.
                    r = timelimit(FEED_TIMEOUT, parse)(f.url, f.etag, f.modified)
                except TimeoutError:
                    print >>warn, 'W: feed [%d] "%s" timed out' % (feednum, f.url)
                    continue

                # Handle various status conditions, as required
                if 'status' in r:
                    if r.status == 301: f.url = r['url']  # permanent redirect: remember new URL
                    elif r.status == 410:  # gone: drop the feed for good
                        print >>warn, "W: feed gone; deleting", f.url
                        feeds.remove(f)
                        continue

                http_status = r.get('status', 200)
                if VERBOSE > 1: print >>warn, "I: http status", http_status
                http_headers = r.get('headers', {
                    'content-type': 'application/rss+xml',
                    'content-length': '1'})
                exc_type = r.get("bozo_exception", Exception()).__class__

                # No entries, no parseable version, and not a 304: classify
                # the failure as precisely as we can, then skip this feed.
                if http_status != 304 and not r.entries and not r.get('version', ''):
                    if http_status not in [200, 302]:
                        print >>warn, "W: error %d [%d] %s" % (http_status, feednum, f.url)
                    elif contains(http_headers.get('content-type', 'rss'), 'html'):
                        print >>warn, "W: looks like HTML [%d] %s" % (feednum, f.url)
                    elif http_headers.get('content-length', '1') == '0':
                        print >>warn, "W: empty page [%d] %s" % (feednum, f.url)
                    elif hasattr(socket, 'timeout') and exc_type == socket.timeout:
                        print >>warn, "W: timed out on [%d] %s" % (feednum, f.url)
                    elif exc_type == IOError:
                        print >>warn, 'W: "%s" [%d] %s' % (r.bozo_exception, feednum, f.url)
                    elif hasattr(feedparser, 'zlib') and exc_type == feedparser.zlib.error:
                        print >>warn, "W: broken compression [%d] %s" % (feednum, f.url)
                    elif exc_type in socket_errors:
                        exc_reason = r.bozo_exception.args[1]
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    elif exc_type == urllib2.URLError:
                        if r.bozo_exception.reason.__class__ in socket_errors:
                            exc_reason = r.bozo_exception.reason.args[1]
                        else:
                            exc_reason = r.bozo_exception.reason
                        print >>warn, "W: %s [%d] %s" % (exc_reason, feednum, f.url)
                    elif exc_type == AttributeError:
                        print >>warn, "W: %s [%d] %s" % (r.bozo_exception, feednum, f.url)
                    elif exc_type == KeyboardInterrupt:
                        raise r.bozo_exception
                    elif r.bozo:
                        print >>warn, 'E: error in [%d] "%s" feed (%s)' % (feednum, f.url, r.get("bozo_exception", "can't process"))
                    else:
                        print >>warn, "=== rss2email encountered a problem with this feed ==="
                        print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                        print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                        print >>warn, "E:", r.get("bozo_exception", "can't process"), f.url
                        print >>warn, r
                        print >>warn, "rss2email", __version__
                        print >>warn, "feedparser", feedparser.__version__
                        print >>warn, "html2text", h2t.__version__
                        print >>warn, "Python", sys.version
                        print >>warn, "=== END HERE ==="
                    continue

                # Oldest entry first, so mails arrive in chronological order.
                r.entries.reverse()

                for entry in r.entries:
                    id = getID(entry)

                    # If TRUST_GUID isn't set, we get back hashes of the content.
                    # Instead of letting these run wild, we put them in context
                    # by associating them with the actual ID (if it exists).
                    frameid = entry.get('id')
                    if not(frameid): frameid = id
                    if type(frameid) is DictType:
                        # Some feeds yield a dict here; use its first value.
                        frameid = frameid.values()[0]

                    # If this item's ID is in our database
                    # then it's already been sent
                    # and we don't need to do anything more.
                    if frameid in f.seen:
                        if f.seen[frameid] == id: continue

                    if not (f.to or default_to):
                        print "No default email address defined. Please run 'r2e email emailaddress'"
                        print "Ignoring feed %s" % f.url
                        break

                    if 'title_detail' in entry and entry.title_detail:
                        title = entry.title_detail.value
                        if contains(entry.title_detail.type, 'html'):
                            title = html2text(title)
                    else:
                        title = getContent(entry)[:70]

                    title = title.replace("\n", " ").strip()

                    # Entry timestamp (falls back to "now") for the Date header.
                    when = time.gmtime()
                    if DATE_HEADER:
                        for datetype in DATE_HEADER_ORDER:
                            kind = datetype + "_parsed"
                            if kind in entry and entry[kind]: when = entry[kind]

                    link = entry.get('link', "")

                    from_addr = getEmail(r, entry)

                    name = h2t.unescape(getName(r, entry))
                    fromhdr = formataddr((name, from_addr,))
                    tohdr = (f.to or default_to)
                    subjecthdr = title
                    datehdr = time.strftime("%a, %d %b %Y %H:%M:%S -0000", when)
                    useragenthdr = "rss2email"

                    # Add post tags, if available
                    tagline = ""
                    if 'tags' in entry:
                        tags = entry.get('tags')
                        taglist = []
                        if tags:
                            for tag in tags:
                                taglist.append(tag['term'])
                        if taglist:
                            tagline = ",".join(taglist)

                    extraheaders = {'Date': datehdr, 'User-Agent': useragenthdr, 'X-RSS-Feed': f.url, 'Message-ID': '<%s>' % hashlib.sha1(id.encode('utf-8')).hexdigest(), 'X-RSS-ID': id, 'X-RSS-URL': link, 'X-RSS-TAGS' : tagline, 'X-MUNGED-FROM': getMungedFrom(r), 'References': ''}
                    if BONUS_HEADER != '':
                        for hdr in BONUS_HEADER.strip().splitlines():
                            pos = hdr.strip().find(':')
                            if pos > 0:
                                extraheaders[hdr[:pos]] = hdr[pos+1:].strip()
                            else:
                                print >>warn, "W: malformed BONUS HEADER", BONUS_HEADER

                    entrycontent = getContent(entry, HTMLOK=HTML_MAIL)
                    contenttype = 'plain'
                    content = ''

                    # Thread-by-tag: one synthetic References entry per tag.
                    if THREAD_ON_TAGS and len(tagline):
                        extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(t.strip().encode('utf-8')).hexdigest() for t in tagline.split(',')])

                    if USE_CSS_STYLING and HTML_MAIL:
                        # Full CSS-styled HTML mail body, built piecewise.
                        contenttype = 'html'
                        content = "<html>\n"
                        content += '<head><meta http-equiv="Content-Type" content="text/html"><style>' + STYLE_SHEET + '</style></head>\n'
                        content += '<body style="word-wrap: break-word; -webkit-nbsp-mode: space; -webkit-line-break: after-white-space;">\n'
                        content += '<div id="entry">\n'
                        content += '<h1 class="header"'
                        content += '><a href="'+link+'">'+subjecthdr+'</a></h1>\n'
                        if ishtml(entrycontent):
                            body = entrycontent[1].strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (summarize(html2text(body, plaintext=True), SUMMARIZE) + "<hr>")
                        else:
                            body = entrycontent.strip()
                            if SUMMARIZE:
                                content += '<div class="summary">%s</div>' % (summarize(body, SUMMARIZE) + "<hr>")
                        if THREAD_ON_LINKS:
                            # Thread-by-link: hash every href found in the body.
                            parser = Parser()
                            parser.feed(body)
                            extraheaders['References'] += ''.join([' <%s>' % hashlib.sha1(h.strip().encode('utf-8')).hexdigest() for h in parser.attrs])
                        if INLINE_IMAGES_DATA_URI:
                            # Replace each <img src> with an inline data: URI.
                            parser = Parser(tag='img', attr='src')
                            parser.feed(body)
                            for src in parser.attrs:
                                try:
                                    img = feedparser._open_resource(src, None, None, feedparser.USER_AGENT, link, [], {})
                                    data = img.read()
                                    if hasattr(img, 'headers'):
                                        headers = dict((k.lower(), v) for k, v in dict(img.headers).items())
                                        ctype = headers.get('content-type', None)
                                        if ctype and INLINE_IMAGES_DATA_URI:
                                            body = body.replace(src, 'data:%s;base64,%s' % (ctype, base64.b64encode(data)))
                                except:
                                    # Best-effort: an unfetchable image must
                                    # not prevent the mail from being sent.
                                    print >>warn, "Could not load image: %s" % src
                                    pass
                        if body != '':
                            content += '<div id="body">\n' + body + '</div>\n'
                        content += '\n<p class="footer">URL: <a href="'+link+'">'+link+'</a>'
                        if hasattr(entry, 'enclosures'):
                            for enclosure in entry.enclosures:
                                if (hasattr(enclosure, 'url') and enclosure.url != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a>\n")
                                if (hasattr(enclosure, 'src') and enclosure.src != ""):
                                    content += ('<br/>Enclosure: <a href="'+enclosure.src+'">'+enclosure.src+'</a><br/><img src="'+enclosure.src+'"\n')
                        if 'links' in entry:
                            for extralink in entry.links:
                                if ('rel' in extralink) and extralink['rel'] == u'via':
                                    extraurl = extralink['href']
                                    # Rewrite Google Reader public-atom links to viewable form.
                                    extraurl = extraurl.replace('http://www.google.com/reader/public/atom/', 'http://www.google.com/reader/view/')
                                    viatitle = extraurl
                                    if ('title' in extralink):
                                        viatitle = extralink['title']
                                    content += '<br/>Via: <a href="'+extraurl+'">'+viatitle+'</a>\n'
                        content += '</p></div>\n'
                        content += "\n\n</body></html>"
                    else:
                        if ishtml(entrycontent):
                            # Plain (non-CSS) HTML mail.
                            contenttype = 'html'
                            content = "<html>\n"
                            content = ("<html><body>\n\n" +
                                       '<h1><a href="'+link+'">'+subjecthdr+'</a></h1>\n\n' +
                                       entrycontent[1].strip() +  # drop type tag (HACK: bad abstraction)
                                       '<p>URL: <a href="'+link+'">'+link+'</a></p>'
                                       )
                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('Enclosure: <a href="'+enclosure.url+'">'+enclosure.url+"</a><br/>\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += 'Via: <a href="'+extralink['href']+'">'+extralink['title']+'</a><br/>\n'
                            content += ("\n</body></html>")
                        else:
                            # Plain-text mail.
                            content = entrycontent.strip() + "\n\nURL: "+link
                            if hasattr(entry, 'enclosures'):
                                for enclosure in entry.enclosures:
                                    if enclosure.url != "":
                                        content += ('\nEnclosure: ' + enclosure.url + "\n")
                            if 'links' in entry:
                                for extralink in entry.links:
                                    if ('rel' in extralink) and extralink['rel'] == u'via':
                                        content += '<a href="'+extralink['href']+'">Via: '+extralink['title']+'</a>\n'

                    # send() returns the (possibly newly opened) mail-server
                    # connection so it can be reused across entries.
                    mailserver = send(fromhdr, tohdr, subjecthdr, content, contenttype, when, extraheaders, mailserver, f.folder)

                    # Mark as sent only after send() succeeded.
                    f.seen[frameid] = id

                f.etag, f.modified = r.get('etag', None), r.get('modified', None)
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                # Deliberate catch-all: one broken feed must not abort the
                # whole run; dump diagnostics and move on.
                print >>warn, "=== rss2email encountered a problem with this feed ==="
                print >>warn, "=== See the rss2email FAQ at http://www.allthingsrss.com/rss2email/ for assistance ==="
                print >>warn, "=== If this occurs repeatedly, send this to [email protected] ==="
                print >>warn, "E: could not parse", f.url
                traceback.print_exc(file=warn)
                print >>warn, "rss2email", __version__
                print >>warn, "feedparser", feedparser.__version__
                print >>warn, "html2text", h2t.__version__
                print >>warn, "Python", sys.version
                print >>warn, "=== END HERE ==="
                continue
    finally:
        unlock(feeds, feedfileObject)
        if mailserver:
            # IMAP housekeeping: flag delivered mail as read in the
            # configured folders.  NOTE(review): only applies when send()
            # returned an IMAP connection — confirm against send().
            if IMAP_MARK_AS_READ:
                for folder in IMAP_MARK_AS_READ:
                    mailserver.select(folder)
                    res, data = mailserver.search(None, '(UNSEEN UNFLAGGED)')
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('STORE', u, '+FLAGS', '(\Seen)')
            # Move day-old read mail out of every folder into
            # IMAP_MOVE_READ_TO (copy, flag deleted, expunge).
            if IMAP_MOVE_READ_TO:
                typ, data = mailserver.list(pattern='*')
                # Parse folder listing as a CSV dialect (automatically removes quotes)
                reader = csv.reader(StringIO.StringIO('\n'.join(data)), dialect='mailboxlist')
                # Iterate over each folder
                for row in reader:
                    folder = row[-1:][0]
                    if folder == IMAP_MOVE_READ_TO or '\Noselect' in row[0]:
                        continue
                    mailserver.select(folder)
                    yesterday = (datetime.now() - timedelta(days=1)).strftime("%d-%b-%Y")
                    res, data = mailserver.search(None, '(SEEN BEFORE %s UNFLAGGED)' % yesterday)
                    if res == 'OK':
                        items = data[0].split()
                        for i in items:
                            res, data = mailserver.fetch(i, "(UID)")
                            if data[0]:
                                u = uid(data[0])
                                res, data = mailserver.uid('COPY', u, IMAP_MOVE_READ_TO)
                                if res == 'OK':
                                    res, data = mailserver.uid('STORE', u, '+FLAGS', '(\Deleted)')
                                    mailserver.expunge()
            # quit() fails if the server already closed the connection;
            # fall back to logout().
            try:
                mailserver.quit()
            except:
                mailserver.logout()
def text(node): if node is not None: return unescape(node.text)
def get_thread(self, id, contacts=None, get_profiles=False): """ Get a thread and its messages. The 'contacts' parameters is only used for internal calls. """ thread = None if isinstance(id, Thread): thread = id id = thread.id if not thread: thread = Thread(int(id)) thread.flags = Thread.IS_DISCUSSION full = False else: full = True with self.browser: mails = self.browser.get_thread_mails(id, 100) my_name = self.browser.get_my_name() child = None msg = None s**t = self._get_slut(mails['member']['pseudo']) if contacts is None: contacts = {} if not thread.title: thread.title = u'Discussion with %s' % mails['member']['pseudo'] for mail in mails['messages']: flags = Message.IS_HTML if parse_dt(mail['date']) > s**t['lastmsg'] and mail[ 'id_from'] != self.browser.get_my_name(): flags |= Message.IS_UNREAD if get_profiles: if not mail['id_from'] in contacts: with self.browser: contacts[mail['id_from']] = self.get_contact( mail['id_from']) signature = u'' if mail.get('src', None): signature += u'Sent from my %s\n\n' % mail['src'] if mail['id_from'] in contacts: signature += contacts[mail['id_from']].get_text() msg = Message(thread=thread, id=int( time.strftime('%Y%m%d%H%M%S', parse_dt( mail['date']).timetuple())), title=thread.title, sender=mail['id_from'], receivers=[ my_name if mail['id_from'] != my_name else mails['member']['pseudo'] ], date=parse_dt(mail['date']), content=unescape(mail['message']).strip(), signature=signature, children=[], flags=flags) if child: msg.children.append(child) child.parent = msg child = msg if full and msg: # If we have get all the messages, replace NotLoaded with None as # parent. msg.parent = None if not full and not msg: # Perhaps there are hidden messages msg = NotLoaded thread.root = msg return thread